diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 85720e69fb3d..5279dd24a716 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,42 +1,52 @@ -For bugs or installation issues, please provide the following information. -The more information you provide, the more likely people will be able to help you. +Note: Providing complete information in the most concise form is the best way to get help. This issue template serves as the checklist for essential information to most of the technical issues and bug reports. For non-technical issues and feature requests, feel free to present the information in what you believe is the best form. -## Environment info -Operating System: +For Q & A and discussion, please start a discussion thread at https://discuss.mxnet.io -Compiler: +## Description +(Brief description of the problem in no more than 2 sentences.) -Package used (Python/R/Scala/Julia): +## Environment info (Required) + +``` +What to do: +1. Download the diagnosis script from https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/diagnose.py +2. Run the script using `python diagnose.py` and paste its output here. -MXNet version: +``` + +Package used (Python/R/Scala/Julia): +(I'm using ...) -Or if installed from source: +For Scala user, please provide: +1. Java version: (`java -version`) +2. Maven version: (`mvn -version`) +3. Scala runtime if applicable: (`scala -version`) -MXNet commit hash (`git rev-parse HEAD`): +For R user, please provide R `sessionInfo()`: -If you are using python package, please provide +## Build info (Required if built from source) -Python version and distribution: +Compiler (gcc/clang/mingw/visual studio): -If you are using R package, please provide +MXNet commit hash: +(Paste the output of `git rev-parse HEAD` here.) -R `sessionInfo()`: +Build config: +(Paste the content of config.mk, or the build command.) ## Error Message: -Please paste the full error message, including stack trace. 
+(Paste the complete error message, including stack trace.) ## Minimum reproducible example -if you are using your own code, please provide a short script that reproduces the error. +(If you are using your own code, please provide a short script that reproduces the error. Otherwise, please provide link to the existing example.) ## Steps to reproduce -or if you are running standard examples, please provide the commands you have run that lead to the error. +(Paste the commands you ran that produced the error.) 1. 2. -3. ## What have you tried to solve it? 1. 2. -3. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000000..468be298b8bd --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,18 @@ +## Description ## +(Brief description on what this PR is about) + +## Checklist ## +### Essentials ### +- [ ] Passed code style checking (`make lint`) +- [ ] Changes are complete (i.e. I finished coding on this PR) +- [ ] All changes have test coverage +- [ ] For user-facing API changes, API doc string has been updated. For new C++ functions in header files, their functionalities and arguments are well-documented. +- [ ] To my best knowledge, examples are either not affected by this change, or have been fixed to be compatible with this change + +### Changes ### +- [ ] Feature1, tests, (and when applicable, API doc) +- [ ] Feature2, tests, (and when applicable, API doc) + +## Comments ## +- If this change is a backward incompatible change, why must this change be made. 
+- Interesting edge cases to note here diff --git a/.gitignore b/.gitignore index 82d2e560237d..fbd62c9ec552 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,7 @@ tracker __pycache__ *.pkl *.params +*.states *.json *.d build @@ -146,3 +147,10 @@ bld target bin/im2rec + +model/ + +# generated function signature for IDE auto-complete +python/mxnet/symbol/gen_* +python/mxnet/ndarray/gen_* +python/.eggs diff --git a/CMakeLists.txt b/CMakeLists.txt index dc9ca5f7bb0c..b6bb81418231 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,31 +1,63 @@ cmake_minimum_required(VERSION 3.0.2) -project(mxnet C CXX) +if((${CMAKE_VERSION} VERSION_GREATER "3.9.0") OR (${CMAKE_VERSION} VERSION_EQUAL "3.9.0")) + set(FIRST_CUDA TRUE) +else() + set(FIRST_CUDA FALSE) +endif() +include(cmake/Utils.cmake) -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake) - include(${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake) +#Some things have order. This must be put in front alone +mxnet_option(USE_CUDA "Build with CUDA support" ON) +mxnet_option(USE_OLDCMAKECUDA "Build with old cmake cuda" OFF) +if(USE_CUDA) + add_definitions(-DMSHADOW_USE_CUDA=1) + IF(FIRST_CUDA AND (NOT USE_OLDCMAKECUDA)) + set(__cuda_toolset "7.5" "8.0" "9.0") + set(CUDA_TOOLSET "8.0" CACHE STRING "Select CUDA Version.") + set_property( CACHE CUDA_TOOLSET PROPERTY STRINGS "" ${__cuda_toolset} ) + set(CMAKE_GENERATOR_TOOLSET "cuda=${CUDA_TOOLSET},host=x64") + project(mxnet C CXX CUDA) + else() + project(mxnet C CXX) + set(FIRST_CUDA FALSE) + endif() +else() + project(mxnet C CXX) + add_definitions(-DMSHADOW_USE_CUDA=0) endif() -set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules;${CMAKE_MODULE_PATH}") -include(cmake/Utils.cmake) mxnet_option(USE_OPENCV "Build with OpenCV support" ON) mxnet_option(USE_OPENMP "Build with Openmp support" ON) -mxnet_option(USE_CUDA "Build with CUDA support" ON) mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search 
path mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF) -mxnet_option(USE_JEMALLOC "Build with Jemalloc support" OFF) +mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON AND NOT MSVC) +mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) +mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) mxnet_option(USE_PROFILER "Build with Profiler support" OFF) mxnet_option(USE_DIST_KVSTORE "Build with DIST_KVSTORE support" OFF) -mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) +mxnet_option(USE_PLUGINS_WARPCTC "Use WARPCTC Plugins" OFF) mxnet_option(USE_PLUGIN_CAFFE "Use Caffe Plugin" OFF) mxnet_option(USE_CPP_PACKAGE "Build C++ Package" OFF) mxnet_option(USE_MXNET_LIB_NAMING "Use MXNet library naming conventions." ON) mxnet_option(USE_GPROF "Compile with gprof (profiling) flag" OFF) mxnet_option(USE_VTUNE "Enable use of Intel Amplifier XE (VTune)" OFF) # one could set VTUNE_ROOT for search path +mxnet_option(INSTALL_EXAMPLES "Install the example source files." 
OFF) + + + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake) + include(${CMAKE_CURRENT_SOURCE_DIR}/build/private/local_config.cmake) +endif() + +set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/Modules;${CMAKE_MODULE_PATH}") + + + SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH") @@ -34,6 +66,7 @@ if("$ENV{VERBOSE}" STREQUAL "1") set(CMAKE_VERBOISE_MAKEFILE ON) endif() + if(MSVC) add_definitions(-DWIN32_LEAN_AND_MEAN) add_definitions(-DDMLC_USE_CXX11) @@ -87,6 +120,9 @@ if(USE_VTUNE) if(NOT VTUNE_ROOT) set(VTUNE_ROOT /opt/intel/vtune_amplifier_xe_2017) endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer -g -pg") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer -g -pg") + set(CMAKE_LINK_LIBRARY_FILE_FLAG "${CMAKE_LINK_LIBRARY_FILE_FLAG} -g -pg") add_definitions(-DMXNET_USE_VTUNE=1) include_directories(${VTUNE_ROOT}/include) list(APPEND mxnet_LINKER_LIBS ${VTUNE_ROOT}/lib64/libittnotify.a) @@ -108,6 +144,8 @@ if(USE_MKL_IF_AVAILABLE) if(NOT MSVC) list(APPEND mxnet_LINKER_LIBS dl) endif() + # If using MKL, use the Intel OMP libraries + list(APPEND mxnet_LINKER_LIBS iomp5) if(USE_MKL_EXPERIMENTAL) add_definitions(-DMKL_EXPERIMENTAL=1) else() @@ -122,14 +160,20 @@ endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) -if(EXISTS ${PROJECT_SOURCE_DIR}/mshadow/cmake) - include(mshadow/cmake/mshadow.cmake) +if(FIRST_CUDA) + include(cmake/ChooseBlas.cmake) include(mshadow/cmake/Utils.cmake) - include(mshadow/cmake/Cuda.cmake) + include(cmake/FirstClassLangCuda.cmake) else() - include(mshadowUtils) - include(Cuda) - include(mshadow) + if(EXISTS ${PROJECT_SOURCE_DIR}/mshadow/cmake) + include(mshadow/cmake/mshadow.cmake) + include(mshadow/cmake/Utils.cmake) + include(mshadow/cmake/Cuda.cmake) + else() + include(mshadowUtils) + include(Cuda) + include(mshadow) + endif() endif() list(APPEND mxnet_LINKER_LIBS ${mshadow_LINKER_LIBS}) @@ -150,7 +194,7 
@@ include_directories("dlpack/include") # add_subdirectory(dlpack) #endif() -if(NOT MSVC) +if(NOT MSVC AND NOT APPLE) set(BEGIN_WHOLE_ARCHIVE -Wl,--whole-archive) set(END_WHOLE_ARCHIVE -Wl,--no-whole-archive) endif() @@ -162,16 +206,38 @@ if(UNIX) endif() endif() +set(ALT_MALLOC_FLAGS "-fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free") + +# ---[ gperftools +if(USE_GPERFTOOLS) + find_package(Gperftools) + if(GPERFTOOLS_FOUND) + message(STATUS "Using Gperftools malloc (tcmalloc)") + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ALT_MALLOC_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ALT_MALLOC_FLAGS}") + set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${GPERFTOOLS_LIBRARIES}) + set(USE_JEMALLOC 0) + endif() +endif() + # ---[ jemalloc if(USE_JEMALLOC) + if(USE_GPERFTOOLS) + message(ERROR "Only one of USE_JEMALLOC and USE_GPERFTOOLS can be defined at once") + endif() find_package(JeMalloc) if(JEMALLOC_FOUND) + message(STATUS "Using JEMalloc malloc") add_definitions(-DUSE_JEMALLOC) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${ALT_MALLOC_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ALT_MALLOC_FLAGS}") include_directories(${JEMALLOC_INCLUDE_DIRS}) set(mxnet_LINKER_LIBS ${mxnet_LINKER_LIBS} ${JEMALLOC_LIBRARIES}) endif() endif() +# ---[ OpenCV if(USE_OPENCV) find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs) if(NOT OpenCV_FOUND) # if not OpenCV 3.x, then imgcodecs are not found @@ -194,36 +260,41 @@ else(USE_OPENCV) add_definitions(-DMXNET_USE_OPENCV=0) endif() +# ---[ OpenMP if(USE_OPENMP) find_package(OpenMP REQUIRED) - if(OPENMP_FOUND) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/openmp/CMakeLists.txt) + # Intel/llvm OpenMP: https://github.com/llvm-mirror/openmp + set(OPENMP_STANDALONE_BUILD TRUE) + set(LIBOMP_ENABLE_SHARED FALSE) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/openmp) + list(REMOVE_ITEM mxnet_LINKER_LIBS iomp5) + list(APPEND mxnet_LINKER_LIBS omp) 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") - set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + else() + if(OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") + endif() endif() elseif(UNIX) list(APPEND mxnet_LINKER_LIBS pthread) endif() +# ---[ LAPack if(USE_LAPACK) add_definitions(-DMXNET_USE_LAPACK=1) list(APPEND mxnet_LINKER_LIBS lapack) else(USE_LAPACK) # Workaround for Windows until using new Jenkinsfile. - if(USE_BLAS STREQUAL "open") + if(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") add_definitions(-DMXNET_USE_LAPACK=1) endif() endif() - -if(UNIX) - find_library(RTLIB rt) - if(RTLIB) - list(APPEND mxnet_LINKER_LIBS ${RTLIB}) - endif() -endif() - # ---[ jemalloc if(USE_JEMALLOC) find_package(JeMalloc) @@ -254,11 +325,18 @@ endif() if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mshadow/cmake) add_subdirectory("mshadow") endif() -FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h") +FILE(GLOB_RECURSE SOURCE "src/*.cc" "src/*.h" "include/*.h") FILE(GLOB_RECURSE CUDA "src/*.cu" "src/*.cuh") # add nnvm to source -FILE(GLOB_RECURSE NNVMSOURCE "nnvm/src/*.cc" "nnvm/src/*.h" "nnvm/include/*.h") +FILE(GLOB_RECURSE NNVMSOURCE + nnvm/src/c_api/*.cc + nnvm/src/core/*.cc + nnvm/src/pass/*.cc + nnvm/src/c_api/*.h + nnvm/src/core/*.h + nnvm/src/pass/*.h + nnvm/include/*.h) list(APPEND SOURCE ${NNVMSOURCE}) # add mshadow file @@ -289,6 +367,10 @@ if(USE_PLUGINS_WARPCTC) list(APPEND CUDA ${PLUGINS_CUSRC}) endif() +if(USE_OPERATOR_TUNING) + add_definitions(-DMXNET_USE_OPERATOR_TUNING=1) +endif() + if(USE_PLUGIN_CAFFE) if(NOT USE_CUDA) 
set(CPU_ONLY ON) @@ -343,37 +425,46 @@ if(MSVC) endif() if(USE_CUDA) - list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES}) - # define preprocessor macro so that we will not include the generated forcelink header - mshadow_cuda_compile(cuda_objs ${CUDA}) - if(MSVC) - FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY}) - set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib") - list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) - FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator - FIND_LIBRARY(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") - list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver - else(MSVC) - list(APPEND mxnet_LINKER_LIBS nvrtc cuda cufft cusolver) - link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") - endif() - list(APPEND SOURCE ${cuda_objs} ${CUDA}) - add_definitions(-DMXNET_USE_CUDA=1) - add_definitions(-DMXNET_USE_NVRTC=1) - if(CUDA_LIBRARY_PATH) - if(IS_CONTAINER_BUILD) - # In case of building on a production-like build container which may not have Cuda installed - if(NOT CMAKE_SYSTEM_HAS_CUDA) - # Assuming building in a container that doesn't have CUDA installed (ie CPU-only build machine) - # so use the stub cuda driver shared library - if(EXISTS ${CUDA_LIBRARY_PATH}/stubs/libcuda.so) - link_directories(${CUDA_LIBRARY_PATH}/stubs) + if(FIRST_CUDA) + mshadow_select_nvcc_arch_flags(NVCC_FLAGS_ARCH) + string(REPLACE ";" " " NVCC_FLAGS_ARCH "${NVCC_FLAGS_ARCH}") + set(CMAKE_CUDA_FLAGS "${NVCC_FLAGS_ARCH}") + set(CMAKE_CUDA_FLAGS_RELEASE "${NVCC_FLAGS_ARCH} -use_fast_math") + list(APPEND mxnet_LINKER_LIBS nvrtc cuda cublas cufft cusolver curand) + list(APPEND SOURCE ${CUDA}) + 
add_definitions(-DMXNET_USE_CUDA=1) + else() + list(APPEND CUDA_INCLUDE_DIRS ${INCLUDE_DIRECTORIES}) + # define preprocessor macro so that we will not include the generated forcelink header + mshadow_cuda_compile(cuda_objs ${CUDA}) + if(MSVC) + FIND_LIBRARY(CUDA_nvrtc_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") + list(APPEND mxnet_LINKER_LIBS ${CUDA_nvrtc_LIBRARY}) + set(CUDA_cuda_LIBRARY "${CUDA_nvrtc_LIBRARY}/../cuda.lib") + list(APPEND mxnet_LINKER_LIBS ${CUDA_cuda_LIBRARY}) + FIND_LIBRARY(CUDA_cufft_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") + list(APPEND mxnet_LINKER_LIBS "${CUDA_cufft_LIBRARY}/../cufft.lib") # For fft operator + FIND_LIBRARY(CUDA_cusolver_LIBRARY nvrtc "${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" "${CUDA_TOOLKIT_ROOT_DIR}/lib/win32") + list(APPEND mxnet_LINKER_LIBS "${CUDA_cusolver_LIBRARY}/../cusolver.lib") # For cusolver + else(MSVC) + list(APPEND mxnet_LINKER_LIBS nvrtc cuda cufft cusolver) + link_directories("${CUDA_TOOLKIT_ROOT_DIR}/lib64") + endif() + list(APPEND SOURCE ${cuda_objs} ${CUDA}) + add_definitions(-DMXNET_USE_CUDA=1) + if(CUDA_LIBRARY_PATH) + if(IS_CONTAINER_BUILD) + # In case of building on a production-like build container which may not have Cuda installed + if(NOT CMAKE_SYSTEM_HAS_CUDA) + # Assuming building in a container that doesn't have CUDA installed (ie CPU-only build machine) + # so use the stub cuda driver shared library + if(EXISTS ${CUDA_LIBRARY_PATH}/stubs/libcuda.so) + link_directories(${CUDA_LIBRARY_PATH}/stubs) + endif() + endif() endif() - endif() endif() - endif() + endif() endif() # unsupported: if caffe is a subdirectory of mxnet, load its CMakeLists.txt as well @@ -398,53 +489,63 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc") endif() +set(MXNET_INSTALL_TARGETS mxnet) if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin" AND USE_MXNET_LIB_NAMING) add_library(mxnet MODULE ${SOURCE}) + add_library(mxnet_static STATIC ${SOURCE}) 
else() if(UNIX) + list(APPEND MXNET_INSTALL_TARGETS mxnet_static) add_library(mxnet_static STATIC ${SOURCE}) # Need an arbitrary source file to trigger CMake to build the library add_library(mxnet SHARED) - # This has prolems, as it adds libmxnet_static to INTERFACE_LINK_LIBRARIES - target_link_libraries(mxnet "-Wl,--whole-archive $ -Wl,--no-whole-archive") - target_link_libraries(mxnet mxnet_static) # Let cmake understand the dependency - add_custom_target( - StaticallyLinkStaticMXNetLibrary ALL - BYPRODUCTS ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}/libmxnet.a - WORKING_DIRECTORY ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY} - COMMAND ln -sf libmxnet_static.a libmxnet.a - DEPENDS mxnet_static - ) + set_target_properties(mxnet_static PROPERTIES OUTPUT_NAME mxnet) + target_link_libraries(mxnet PRIVATE "-Wl,--whole-archive $ -Wl,--no-whole-archive") + target_link_libraries(mxnet PRIVATE mxnet_static) # Let cmake understand the dependency else() add_library(mxnet SHARED ${SOURCE}) endif() endif() -target_link_libraries(mxnet ${mxnet_LINKER_LIBS}) - -if(USE_PLUGINS_WARPCTC) - target_link_libraries(mxnet debug ${WARPCTC_LIB_DEBUG}) - target_link_libraries(mxnet optimized ${WARPCTC_LIB_RELEASE}) -endif() - -target_link_libraries(mxnet dmlc) -if(MSVC AND USE_MXNET_LIB_NAMING) - set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") +if(USE_CUDA) + if(FIRST_CUDA) + target_compile_options(mxnet PUBLIC "$<$:-Xcompiler=-MTd>") + target_compile_options(mxnet PUBLIC "$<$:-Xcompiler=-MT>") + endif() endif() - - if(USE_DIST_KVSTORE) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/ps-lite/CMakeLists.txt) add_subdirectory("ps-lite") - list(APPEND pslite_LINKER_LIBS pslite) - target_link_libraries(mxnet debug ${pslite_LINKER_LIBS_DEBUG}) - target_link_libraries(mxnet optimized ${pslite_LINKER_LIBS_RELEASE}) + list(APPEND pslite_LINKER_LIBS pslite protobuf) + target_link_libraries(mxnet PUBLIC debug ${pslite_LINKER_LIBS_DEBUG}) + target_link_libraries(mxnet PUBLIC optimized ${pslite_LINKER_LIBS_RELEASE}) 
+ if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_DEBUG}) + else() + list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS_RELEASE}) + endif() + target_link_libraries(mxnet PUBLIC debug ${pslite_LINKER_LIBS_DEBUG}) + target_link_libraries(mxnet PUBLIC optimized ${pslite_LINKER_LIBS_RELEASE}) + else() - set(pslite_LINKER_LIBS protobuf zmq-static ) + set(pslite_LINKER_LIBS protobuf zmq-static) endif() add_definitions(-DMXNET_USE_DIST_KVSTORE) - target_link_libraries(mxnet ${pslite_LINKER_LIBS}) include_directories(SYSTEM ${pslite_INCLUDE_DIR}) + list(APPEND mxnet_LINKER_LIBS ${pslite_LINKER_LIBS}) +endif() + +target_link_libraries(mxnet PUBLIC ${mxnet_LINKER_LIBS}) + +if(USE_PLUGINS_WARPCTC) + target_link_libraries(mxnet PUBLIC debug ${WARPCTC_LIB_DEBUG}) + target_link_libraries(mxnet PUBLIC optimized ${WARPCTC_LIB_RELEASE}) +endif() + +target_link_libraries(mxnet PUBLIC dmlc) + +if(MSVC AND USE_MXNET_LIB_NAMING) + set_target_properties(mxnet PROPERTIES OUTPUT_NAME "libmxnet") endif() if(USE_PROFILER) @@ -453,6 +554,18 @@ endif() add_subdirectory(tests) +include(GNUInstallDirs) +install(TARGETS ${MXNET_INSTALL_TARGETS} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} +) + +install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) +if (INSTALL_EXAMPLES) + install(DIRECTORY example DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}) +endif() + # AUTO_INSTALL_DIR -> Optional: specify post-build install direcory if(AUTO_INSTALL_DIR) # ---[ Install Includes @@ -493,6 +606,7 @@ if(MSVC) find_package(PythonInterp) set(PYTHON_EXECUTABLE ${PYTHON_EXECUTABLE} CACHE FILEPATH "Path to the python executable") endif() -set(LINT_DIRS include src scripts python tests cpp-package) -add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} 
-DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -DPROJECT_NAME=mxnet -P ${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/cmake/lint.cmake) +set(LINT_DIRS "include src plugin cpp-package tests") +set(EXCLUDE_PATH "src/operator/contrib/ctc_include") +add_custom_target(mxnet_lint COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE} -DLINT_DIRS=${LINT_DIRS} -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} -DPROJECT_NAME=mxnet -DEXCLUDE_PATH=${EXCLUDE_PATH} -P ${CMAKE_CURRENT_SOURCE_DIR}/dmlc-core/cmake/lint.cmake) diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 000000000000..57b4ec3cb3fb --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,14 @@ +# Owners of Apache MXNet + +# Global owners +* @apache/mxnet-committers + +# Owners of language bindings +R-package/* @thirdwing +scala-package/* @javelinjs +perl-package/* @sergeykolychev + +# CMake owners +CMakeLists.txt @cjolivier01 +cmake/* @cjolivier01 + diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8cae93854e19..64cd29dc0bbe 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,5 +1,5 @@ -Contributors of DMLC/MXNet -========================== +Contributors of Apache MXNet (incubating) +========================================= MXNet has been developed by a community of people who are interested in large-scale machine learning and deep learning. Everyone is more than welcomed to is a great way to make the project better and more accessible to more users. @@ -7,6 +7,7 @@ Committers ---------- Committers are people who have made substantial contribution to the project and being active. The committers are the granted write access to the project. +A full list of committers can be found here: http://incubator.apache.org/projects/mxnet.html * [Bing Xu](https://github.com/antinucleon) - Bing is the initiator and major contributor of operators and ndarray modules of mxnet. @@ -39,6 +40,7 @@ The committers are the granted write access to the project. 
- Zixuan is one of major maintainers of mxnet scala package. * [Yuan Tang](https://github.com/terrytangyuan) - Yuan is one of major maintainers of mxnet scala package. +* [Chris Olivier](https://github.com/cjolivier01) ### Become a Committer MXNet is a opensource project and we are actively looking for new committers @@ -50,7 +52,7 @@ New committers will be proposed by current committers, with support from more th List of Contributors -------------------- -* [Full List of Contributors](https://github.com/dmlc/mxnet/graphs/contributors) +* [Full List of Contributors](https://github.com/apache/incubator-mxnet/graphs/contributors) - To contributors: please add your name to the list when you submit a patch to the project:) * [Feng Wang](https://github.com/happynear) - Feng makes mxnet compatible with Windows Visual Studio. @@ -146,3 +148,4 @@ List of Contributors * [Xizhou Zhu](https://github.com/einsiedler0408/) * [Jean Kossaifi](https://github.com/JeanKossaifi/) * [Kenta Kubo](https://github.com/kkk669/) +* [Manu Seth](https://github.com/mseth10/) diff --git a/Jenkinsfile b/Jenkinsfile index 95115cf58920..cbe63758ac70 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -7,37 +7,41 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' // timeout in minutes -max_time = 60 +max_time = 1440 // assign any caught errors here err = null -// set build status to success by default -currentBuild.result = "SUCCESS" // initialize source codes def init_git() { + deleteDir() retry(5) { try { timeout(time: 2, unit: 'MINUTES') { checkout scm sh 'git submodule update --init' + sh 'git clean -d -f' } } catch (exc) { deleteDir() - error "Failed to fetch source codes" + error "Failed to fetch source codes with ${exc}" + sleep 2 } } } def init_git_win() { + deleteDir() retry(5) { try { timeout(time: 2, unit: 'MINUTES') { checkout scm bat 'git submodule update --init' + bat 'git clean 
-d -f' } } catch (exc) { deleteDir() - error "Failed to fetch source codes" + error "Failed to fetch source codes with ${exc}" + sleep 2 } } } @@ -50,7 +54,7 @@ def make(docker_type, make_flag) { try { sh "${docker_run} ${docker_type} make ${make_flag}" } catch (exc) { - echo 'Incremental compilation failed. Fall back to build from scratch' + echo 'Incremental compilation failed with ${exc}. Fall back to build from scratch' sh "${docker_run} ${docker_type} sudo make clean" sh "${docker_run} ${docker_type} sudo make -C amalgamation/ clean" sh "${docker_run} ${docker_type} make ${make_flag}" @@ -78,11 +82,18 @@ echo ${libs} | sed -e 's/,/ /g' | xargs md5sum } // Python unittest for CPU -def python_ut(docker_type) { +// Python 2 +def python2_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete" - sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests --with-timer --verbose tests/python/unittest" - sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests --with-timer --verbose tests/python/train" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/unittest" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/train" + } +} + +// Python 3 +def python3_ut(docker_type) { + timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete" sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest" } @@ -90,10 +101,17 @@ def python_ut(docker_type) { // GPU test has two parts. 1) run unittest on GPU, 2) compare the results on // both CPU and GPU -def python_gpu_ut(docker_type) { +// Python 2 +def python2_gpu_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . 
-name '*.pyc' -type f -delete" - sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests --with-timer --verbose tests/python/gpu" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/gpu" + } +} + +// Python 3 +def python3_gpu_ut(docker_type) { + timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete" sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/gpu" } @@ -150,11 +168,21 @@ try { } } }, + 'Amalgamation MIN': { + node('mxnetlinux') { + ws('workspace/amalgamationmin') { + init_git() + make('cpu', '-C amalgamation/ clean') + make('cpu', '-C amalgamation/ USE_BLAS=openblas MIN=1') + } + } + }, 'Amalgamation': { node('mxnetlinux') { ws('workspace/amalgamation') { init_git() - make('cpu', '-C amalgamation/ USE_BLAS=openblas MIN=1') + make('cpu', '-C amalgamation/ clean') + make('cpu', '-C amalgamation/ USE_BLAS=openblas') } } }, @@ -185,6 +213,7 @@ try { withEnv(['OpenBLAS_HOME=C:\\mxnet\\openblas', 'OpenCV_DIR=C:\\mxnet\\opencv_vc14', 'CUDA_PATH=C:\\CUDA\\v8.0']) { init_git_win() bat """mkdir build_vc14_cpu + call "C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\bin\\x86_amd64\\vcvarsx86_amd64.bat" cd build_vc14_cpu cmake -G \"Visual Studio 14 2015 Win64\" -DUSE_CUDA=0 -DUSE_CUDNN=0 -DUSE_NVRTC=0 -DUSE_OPENCV=1 -DUSE_OPENMP=1 -DUSE_PROFILER=1 -DUSE_BLAS=open -DUSE_LAPACK=1 -DUSE_DIST_KVSTORE=0 ${env.WORKSPACE}""" bat 'C:\\mxnet\\build_vc14_cpu.bat' @@ -242,31 +271,75 @@ try { } stage('Unit Test') { - parallel 'Python2/3: CPU': { + parallel 'Python2: CPU': { node('mxnetlinux') { - ws('workspace/ut-python-cpu') { + ws('workspace/ut-python2-cpu') { init_git() unpack_lib('cpu') - python_ut('cpu') + python2_ut('cpu') } } }, - 'Python2/3: GPU': { + 'Python3: CPU': { node('mxnetlinux') { - ws('workspace/ut-python-gpu') { + ws('workspace/ut-python3-cpu') { + init_git() + unpack_lib('cpu') + 
python3_ut('cpu') + } + } + }, + 'Python2: GPU': { + node('mxnetlinux') { + ws('workspace/ut-python2-gpu') { + init_git() + unpack_lib('gpu', mx_lib) + python2_gpu_ut('gpu') + } + } + }, + 'Python3: GPU': { + node('mxnetlinux') { + ws('workspace/ut-python3-gpu') { init_git() unpack_lib('gpu', mx_lib) - python_gpu_ut('gpu') + python3_gpu_ut('gpu') } } }, - 'Python2/3: MKLML': { + 'Python2: MKLML-CPU': { node('mxnetlinux') { - ws('workspace/ut-python-mklml') { + ws('workspace/ut-python2-mklml-cpu') { init_git() unpack_lib('mklml') - python_ut('mklml_gpu') - python_gpu_ut('mklml_gpu') + python2_ut('mklml_gpu') + } + } + }, + 'Python2: MKLML-GPU': { + node('mxnetlinux') { + ws('workspace/ut-python2-mklml-gpu') { + init_git() + unpack_lib('mklml') + python2_gpu_ut('mklml_gpu') + } + } + }, + 'Python3: MKLML-CPU': { + node('mxnetlinux') { + ws('workspace/ut-python3-mklml-cpu') { + init_git() + unpack_lib('mklml') + python3_ut('mklml_gpu') + } + } + }, + 'Python3: MKLML-GPU': { + node('mxnetlinux') { + ws('workspace/ut-python3-mklml-gpu') { + init_git() + unpack_lib('mklml') + python3_gpu_ut('mklml_gpu') } } }, @@ -313,7 +386,7 @@ try { sh "${docker_run} cpu rm -rf .Renviron" sh "${docker_run} cpu mkdir -p /workspace/ut-r-cpu/site-library" sh "${docker_run} cpu make rpkg USE_BLAS=openblas R_LIBS=/workspace/ut-r-cpu/site-library" - sh "${docker_run} cpu R CMD INSTALL --library=/workspace/ut-r-cpu/site-library mxnet_current_r.tar.gz" + sh "${docker_run} cpu R CMD INSTALL --library=/workspace/ut-r-cpu/site-library R-package" sh "${docker_run} cpu make rpkgtest R_LIBS=/workspace/ut-r-cpu/site-library" } } @@ -328,13 +401,13 @@ try { sh "${docker_run} gpu rm -rf .Renviron" sh "${docker_run} gpu mkdir -p /workspace/ut-r-gpu/site-library" sh "${docker_run} gpu make rpkg USE_BLAS=openblas R_LIBS=/workspace/ut-r-gpu/site-library" - sh "${docker_run} gpu R CMD INSTALL --library=/workspace/ut-r-gpu/site-library mxnet_current_r.tar.gz" + sh "${docker_run} gpu R CMD INSTALL 
--library=/workspace/ut-r-gpu/site-library R-package" sh "${docker_run} gpu make rpkgtest R_LIBS=/workspace/ut-r-gpu/site-library R_GPU_ENABLE=1" } } } }, - 'Python2/3: CPU Win':{ + 'Python 2: CPU Win':{ node('mxnetwindows') { ws('workspace/ut-python-cpu') { init_git_win() @@ -343,20 +416,30 @@ try { 7z x -y vc14_cpu.7z''' bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 + call activate py2 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc C:\\mxnet\\test_cpu.bat""" - bat """xcopy C:\\mxnet\\data data /E /I /Y + } + } + }, + 'Python 3: CPU Win': { + node('mxnetwindows') { + ws('workspace/ut-python-cpu') { + init_git_win() + unstash 'vc14_cpu' + bat '''rmdir /s/q pkg_vc14_cpu + 7z x -y vc14_cpu.7z''' + bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y - call activate py2 + call activate py3 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_cpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_cpu\\python\\*.pyc C:\\mxnet\\test_cpu.bat""" } } }, - 'Python2/3: GPU Win':{ + 'Python 2: GPU Win':{ node('mxnetwindows') { ws('workspace/ut-python-gpu') { init_git_win() @@ -365,19 +448,29 @@ try { 7z x -y vc14_gpu.7z''' bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y - call activate py3 + call activate py2 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc C:\\mxnet\\test_gpu.bat""" + } + } + }, + 'Python 3: GPU Win':{ + node('mxnetwindows') { + ws('workspace/ut-python-gpu') { + init_git_win() + unstash 'vc14_gpu' + bat '''rmdir /s/q pkg_vc14_gpu + 7z x -y vc14_gpu.7z''' bat """xcopy C:\\mxnet\\data data /E /I /Y xcopy C:\\mxnet\\model model /E /I /Y - call activate py2 + call activate py3 set PYTHONPATH=${env.WORKSPACE}\\pkg_vc14_gpu\\python del /S /Q ${env.WORKSPACE}\\pkg_vc14_gpu\\python\\*.pyc C:\\mxnet\\test_gpu.bat""" } } - } + } } stage('Integration 
Test') { @@ -428,9 +521,11 @@ try { } } } + // set build status to success at the end + currentBuild.result = "SUCCESS" } catch (caughtError) { node("mxnetlinux") { - sh "echo caught error" + sh "echo caught ${caughtError}" err = caughtError currentBuild.result = "FAILURE" } diff --git a/KEYS b/KEYS index 070f38d4f78e..d646bb7c3f11 100644 --- a/KEYS +++ b/KEYS @@ -189,3 +189,177 @@ TNxwR0b9K/mLKGh58n1vVT79QReQFQ4wWFyQkmFkL9ybG04wTKe00VDNP987nSBg FuSamX64+S6T8IwAuP9U =KRiV -----END PGP PUBLIC KEY BLOCK----- +pub 4096R/C65AF308 2017-08-15 [expires: 2021-08-15] +uid [ultimate] Sandeep Krishnamurthy +sig 3 C65AF308 2017-08-15 Sandeep Krishnamurthy +sub 4096R/3D0D60F6 2017-08-15 [expires: 2021-08-15] +sig C65AF308 2017-08-15 Sandeep Krishnamurthy + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBFmTNosBEACuTlborR4n0MyVHTndYNVAjT3MNkJDitsSAeBpUl2wXUZNZ3YJ +iDfLsDCdzSTc/uYsfeVfRay1nYZQNBO2ikRVg5CO+Q4T7wceZ7uFwEVNnKNRiFlI +GjJMnjWa4g4GqmRLrEDJXxibFLWuCZgu8b2z/EQfCp+lBY4Q01/ag6ydoejDicRF +sHrdmt8bJolvAjNepdsW2nxOAo/j00yDyR/xdCCIXATopHdaS3isGlF+gsXr3PTT +oLUqqwst+Sx8Zc/0cCd+QXtzrb5jImtKHTj9nQJpznHxWeGhQsdd6Hvt76lrJBRm +MKM4Ti+jzy2yCs+VLOpqiY3AUuleNELQ6LgGgZGDY4doLtliSjpEiddHVkXNV+et +gq114Ucr86pPPS7I92yx3FES9uforljzZB0MZRDv3feaBZKy8+HR040I2/PSe3eD +PU7qb4Sj6vUhxztLGAKjKWFt+DbFuUZqS8mGCk1fFId49+U/XQvjbmy7GfTRXZ05 +XWwf6SAerh8gDBxFm60ALz+7LiJYy6D8HBAE+HTLf4/FK3o1cuW9niHWO/7RdHiC +XW9N98dwPm49nn+bXan45lT85zTJhUOWY5/PQMitj68D4Z6EHDnFaSBTcvvjCor9 +sJSJKh8p36df489xD3fe7D7ckzu1J7STGkvarQ+wkWTrdCK6dzFmo0cnzwARAQAB +tCZTYW5kZWVwIEtyaXNobmFtdXJ0aHkgPHNrbUBhcGFjaGUub3JnPokCPQQTAQoA +JwUCWZM2iwIbAwUJB4YfgAULCQgHAwUVCgkICwUWAgMBAAIeAQIXgAAKCRBRjac6 +xlrzCOMlD/0dq36zb/lmyBbpd3Xznag4YZyLbO7oR7IwKK6/X8VMOYX3Jd8bXncR +w09G/03or2j5MZIfhYH0yOEUHhXae8Ewi2/kGwMGH8nEc9njSv4cZKx76iQ8OBcL +/V3DZZEc1h7bDaqheOiWpfSEnSg3SO9LyHDh1s+7UomIccFQgBCUg/BpIB2A1i1/ +aAAPeqBWqEqOIC+QHlXL8RXotC4K/h+qc8XVa5GQKuAuJZNb6P4twiDXzJaeFYuE +x6ZeJ7aTJW8cnG/5ShXyrjd/awdpnuUZNoUBVF/ZIxBNGJSD6005BFIysIVFIyIz 
++Qf+SLyMdRsAZQFm5xg9HkBRz7/YUY5q1FZz8VBl6XzZLmqpU9siB4O6jzbDQ2BW +fng8FR1kQvaHqIDA3zCDslas1imlcHqJjmKVEmoVB77ug3aTuq3o817ExrUsjciZ +CUOedZ3TnmF3I/YLryqKKUzGSzrNuEFa3lRzL7pULKsRzoWBdbhW3XskKWux1SlP +OWEr/lm8Yg+P2wDikY7jYQeJU/iZKc3cEUbo/edLA5+12mbL3cUWP/If2+MXc04f +nbhKZtb0jcB91KSUgMi+TP/IM9l5w3q+qmG6BDyBwwE0rf/VVvOqh9Ayt+iyLwfz +VLkEVsTpqsjBxCEd/GAUoIBK6cCchDmCbKMgQqWySxaA+gb6oul/abkCDQRZkzaL +ARAAweT6N1419HCIODjuUaQNFBDuu+IjROYllJCbqaHxXoj1uvOQD2wMxZo43IIk +NqxbAgV1z90phiOLBC44z2MNvUj2yx41iZ1zkRkBRTx+Q3oUm9Vcon9a7blfh/mW +/YhPbs2E1ukK/tL317rerdrNet3EhiqglHU3a6I4D/oPnQ/8t9UcYiGVULrhUVTc +GoNdkz+Fu3Fmai/stnUNnYoMQIeBBukoF++SKfxm/y57LeScMbqJX2lPEVRm4ehx +SNlO38/p42aVsNzTuWlb0otGh+WaHZvmfLpckjZVCk+rGqoFUkx+BWfoyLv8VOPJ +qY4hQ0SjAeEcYwK2/Z99d2/0BN0zemldjvtT7Unrnm7hni3Ore9QaHCqhvyfh9P2 +hcdg9FS8VOEtxjdW/IAWEcT62drZz4WAarv1a8gxogwXtjt5acWMgkwuAXwhQM/D +g4KkKMtT182MmF/cVn4RfGkRihnbHCXrp+sej19I3hUaB855PfIZ0v8N7fU2jM0h +cv84ha6w0bV0Ab4HFqo+6NT4c4yPh/PgotEXYgkIQauPkicgkuETfOyuYLtdcPGj +c3kP4HnJ6JXVq7VlI/kEO+LWEi8ygsUP5l3dfLPf1O6BOANQAF7tI/5g1pB75miX +Y1yx9liFtJUuhXs3IN/6d1I2ZA3Q17LWbb6uh9ecINiLSPsAEQEAAYkCJQQYAQoA +DwUCWZM2iwIbDAUJB4YfgAAKCRBRjac6xlrzCIRTD/9gEejzaO7f4VnmovrYkoBt +wF2b2z7F9RhdxgCAvBfuQ7mI6WzH2chYP7SwSjugB+XPFQ5fM0sa8UHKyBryGBfx +aQaxS/1lFyIM7g/zNCNxHQN9ZLmIK2i+VSKZYjbZDh6ESnT/h2Shk5IN/8ho11+e +nGehxVDDEpEBAtwETwLOh7v9+uXfn6uDchBZJIT/SwqONUWsOQrsB6JEPKoSCaM6 +PcRjmU+Vd8DLkxrvQe0PqlYiiiNZiRv1WJ2ywPXTU9dvsAdRgvPWdNGcPhjjQwuj +e95OzvNGGdcVPX+cudfiSm9/BTCEBpUkjg8rFSjh3YisJ18TuV9DzG1X/bvWLXaY +zACetOaMYkvMboZfwHdbCPpD/MGnqAjFWoRvdKlV0ZZfwUOzEEp6NHFfPw1+9px5 +BremMOrPkPZALGs+7mb9s8tvC2OKMkbXzqDsXnZUS5p3QgrensHBQt9FyaLlE0Zi +x+/cW/NpygBSjWyGX3ahdS30U4rl5BfkfyzvCkohWSoaVODf/2HjxEyVVONDShBf +kpbpFOW5jAfVjs2ZqlVsUkNCL9wSPKUU30szkZ1AT2BOivJoCJv8WgF7np1LAZCa +C0d/3i4v3h6AgzUqDjDILFCNiDgsaKtdoB5dK1tgjC3wX42lRAXDsTnF78c5iGk4 +6gcqWf5YDM+zv+yvsafmug== +=my1/ +-----END PGP PUBLIC KEY BLOCK----- +pub 4096R/703DF31B 2017-10-16 +uid Chris Olivier (CODE SIGNING KEY) +sig 3 
703DF31B 2017-10-16 Chris Olivier (CODE SIGNING KEY) +sub 4096R/7B90EEF2 2017-10-16 +sig 703DF31B 2017-10-16 Chris Olivier (CODE SIGNING KEY) + +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1 + +mQINBFnk4CsBEADBZBL8aDcNMHfpxd/gCPYf+sw0EyWJZ+whUwwod8TmZohJr84a +Lb3RTgCfOo5XU7DYYZoQGJsJsTgPw4bAQEPbHjNr0NNKR1IphYo8mhKxhzrocNDO +QI9X/PyyZRNdW8hRHzBZDB+Wrb3dp+J1a6Zn+hK3LhQ+I2HHBM/oWx9j5+0NqKs+ +kqOIC7H3hY1+ZG/jhaG4L1/VMShsPrDD/yiGPUyF2gq345Q+5VUlMXRy1iHJjqC2 +syEUS4m5xL6csqz4JYZlPNKYKwBzzJ5O3SKaLGESLMZTmdjvXmTb4nBbj1ioQcwh +N3IRaBFp+0Bxxcv0Jp/c9GLJF5u1aKOmYmlpyeQLF8APrABtkoX1/hCQhqk9nlpb +Y5+gq+VolJF6W78QxEVdpBBguFS+qSnXY+6YxrMel+XPKwgwJw5GDCd4tcy0dkoM +PLyA5azyOiQOzxHbEi0SheD1kcSSW3sqTdCXzIxHc9Sdwb6sHiboPFJSwIl79R1j +k0WztKIccVrlgcpiYyzVHr+bRRPgcW3P83mp3YVzsUdFdxYiVI6drAkcOD8NPumB +ehP8Ih1GOejemN5RIzq1ZLhZOBH1Y8MM44R/Z8YthzobXbwMQJY8n9jvUi81/DC3 +NX2MWui4AWXdzIgQxMi3l9n+1LOQXhEUfJW0I2gw+QqKFUeSnc+mX/SmDwARAQAB +tDhDaHJpcyBPbGl2aWVyIChDT0RFIFNJR05JTkcgS0VZKSA8Y2pvbGl2aWVyMDFA +Z21haWwuY29tPokCOAQTAQIAIgUCWeTgKwIbAwYLCQgHAwIGFQgCCQoLBBYCAwEC +HgECF4AACgkQgP2B13A98xsvKhAAn8mouJvwdFp6Pm7BHOpTA1uKcQq0V/nctaDB +2laT3pklv+vFWioviFsTcwAyQ8JW80/Sl+ENRt06vqtUXWOFaJn14mLhASEveur0 +O7itKXePAb77UdoPKvCixnGldycnocxYVekcJJ11YQR1L3cN76dC3E3G9nVZFkXI +TE6unN0XTJ0BRsihZcvE896v7zoWwa4RqaFL2EKiROsQr0WuK345Cj5nhVlISQs1 +jPcg52PYHSomIencNVWvuBWeS6LowBkC/VY3tDyI6JYBpjQxeJN6ctd/U4SLoaDA +EAbkbJqj4bvXVBGBpCdZdsv9RXldGAw8XOSWjEmJDBV5x1KzTyrbqPAlV9GPvV7m +3LiB2fRdkTLt9Z4UnOkOtvOQX9Zo7O5vBMbTFGpJKIqeJ6SWZjol35PQiLJ8vJdh +U4qLLBmK6rTTipDXV74WYjYoO0f9DPBdgpY/fR6JZRmyN4fVVc3fGvJZZDyQv+KT +lnfaXFvlSPGE6QsHO7ghbaKvRla4mUpf9BJq7YWiA3ghLVClHC5erWb4daxpn4vN +nby4Djk2CanndIXwLOL3WDsGaUB/JCIvS8AYkKqk1nCgc4r/8jjUdYIc0Qll1Zn6 +5HBu5ju7gEtVl2QHIs2VI7YhJa2Z6mKUymhwkT3WBDpMDn8rgAu1UsLalhkFReF/ +jVh/mTW5Ag0EWeTgKwEQALdJpNgiIGWYZDj892gKk+/zsRgaMMmvfEEdfpY5mWvr +t8L1NC/A+/K4oQgBaFvJyXHS1FGc3e2Te2Vqi0raoUjSzpX0KcsemeAxbEtpt14A +xZ/LSz9R1PZEgnGwgkk2QaGDJThHAjrLUZRT08NORb35hI33yPhMnfHt/flW7D+B 
+ZtJRk0wD/O+B5RGd73Cn5gHGZmoOzEVw9SmmrIrUS+meZOwIadJ7nu0VkijtpghM +CO1KE/D3ocv4RhtNC0T3xP5HN961kBYO2s+3ddW7uYRlB78mEdFZ/bbIWqedXLGt +IvSm4bEoySfPbmFzGTBAXdUDCMNeh3Xtw5UZaDEdiq4hgnfswfH301QMQJzWQTzp +zY3LhiPGs4fP0IBzjWygIBxktRJKIfQWYE6J9cNC+Xe+mrfpqpwNMsPt78flwghD +sOFmvrwGYnlx8mehnwrW5OIQFqucLVXoDUKi/qmCub5QcJEXmEqfiVlAyDlhrH6L +t8S8lTDP08WJtN9tlRCL/66YUGSGoCGw3R6UUxhk1qhTHU/vlk5RTzO1I/SaVkQs +Lrn0sQwQlunHzhtawne2vof9/kxCzV6vvnrrLPa7c/Y506IVDU54KzMAKdCuUUaK +DmGo+Iu1fweRd5360Put4Fgv21fC1W3/yb0ZMqefoaH4ekIcbQWubAt896K2wZN/ +ABEBAAGJAh8EGAECAAkFAlnk4CsCGwwACgkQgP2B13A98xu7Qg/+MrduvSlvrRka +U93vaumhV25uzjFZnEsMBe61xZp4NrF6BhSdWJgSZXI+8xVblvNcTemIjCuRAXIt +zqmN9O6UOEnRn4B0IOvXPAw/RT+XY6NCLb01Nd7EcIo0cAdQi4hWaPk0caZ5ABSu +Ss5sFzsYyShhGNbo0KKYqGsYc4qz7mOr6QG5/opmi4UWIEr9LZkIjlY/TT6ltSyd +GIVb//KeUobVxEpMe1iET7YB/8Yvad5TWXpcSdIpwfj6l3W3HsLHbx0zhndFuWOY +npLS+f3/SFu60Bq0rmF3W12CmeZTlgtSvJowhhfnnmBIo8cZz+G6EYutmgukd8QE +lD1E967Tbh4aJnKzODhpF/uF+zxwWw7gBvDOTMKblOZBTSMKYutuSuCi3Ysfw02g +XYjMQkUQMJIxj5AoVipGDRJYKCAu508CLsvGJGrJF+nehfsbdd8Kye4SBaOWA94Y +8JuGTFarFyRlKshgRN0qGVs2wo64Se3p9EAvrIwqqyItJw7dTmxXQlkbCWSfEdjM +IjljtjhIMhMLB5rf8BPCZ6og5fKqUF5LOp8DujG2DGa9ZhYWTzOO/UGZP60qGTot +5bm+5Ovl57Yk4UUUSC+Uk+yZ9QOAdOVtbTX+SbmNCUmZ+mTcB6A/XoA5jKsVUyZm +GZZVNUU0hQYfulYDY5E8fJ4Olzpf5OE= +=WmLB +-----END PGP PUBLIC KEY BLOCK----- +pub rsa4096 2017-11-21 [SC] + 331E9A5ED727FADD429B2894F2F1EAB589EBCFB1 +uid [ultimate] Haibin Lin +sig 3 F2F1EAB589EBCFB1 2017-11-21 Haibin Lin +sub rsa4096 2017-11-21 [E] +sig F2F1EAB589EBCFB1 2017-11-21 Haibin Lin + +-----BEGIN PGP PUBLIC KEY BLOCK----- + +mQINBFoTp3YBEACiGa++rsTjQal+33xADuWxzN9L8bTkMu4uFJqYvyNP2z1Q0fcM +DFjLJcvsc3ODSlkDGlkrtFpYlqkBTFERABU19TcAQ5FYFu1uULUybtHm55h6OKAm +1qfSRcKvdidDRytf7XAnhK/jvjtY71EQZUz2OtvKj0p93C22JcaJasKjHEF+8Jv0 +1rvV4BsZcY3hl9ORbv+nvBB6PX6zkpfhh0edVl50yzJEM34dtBZ1CTVlcJhIj0yo +LEZkt+zKEz5C3/D5OgM2DoclUInAvPeIGXvOgoQi9he4YjMppC3fmcA9O+sJ8XFh +dqNxcI+ddcvg84g4ntC2iJb8OOX75xkkoIsJXhZgwxBbdnwINNY6Eqqyx2lMvGRI 
+BLTSxLKsfX/mCmW9mwNrKxfrBIb107ldxwfo+13/Vh45nIlhM0yxfhlukHmYEHp+ +G+T+aD67t0HHZHr27M2x0qTdKkRoI+7xYTUvu+OmObJej48UDhi4GMAjQ61TeLm1 +OyetyMoKpB+Cah1n0O5j6nDPRJBS9OPi361DIZRhlg4IkrbIP5MHs+Zvof8O04xq +GRfYAqEhT6rP98TidpHVhFEV3CrDLVDJLZ3Vqglj2iyNOjEjF1GJJBaFWUoXhKPs +WVZMfgpkaXRwng6r6ieRmmt/Ci//JV6ztkwKk7e0OQJBqbwA0A7lqx7j2QARAQAB +tCVIYWliaW4gTGluIDxsaW5oYWliaW4uZXJpY0BnbWFpbC5jb20+iQJOBBMBCAA4 +FiEEMx6aXtcn+t1CmyiU8vHqtYnrz7EFAloTp3YCGwMFCwkIBwIGFQgJCgsCBBYC +AwECHgECF4AACgkQ8vHqtYnrz7GFWA//Z6YTxtlZSHFlqkAFFOsDtV3DghSC8zJe +LRm508fZn53e9a3fUvT9U1sUfW8DI69GRK+IBkvP5hcmMb1U4N3MxzX4YC/13wMY +3BtUbCIpD8uBJOtuC7fPAH//Ij/4wv4Fp1/3WL6y04+mJIayMyKqmc3nBLD0rVWC +AHEsPR7tiDDMltrzxMNHIJCDaiClJzKiCrQ4owKBOnY2TU/E64xyk5IwAczz2lCY +712h6+q2mO7F672Yt6b6pqmugnFqWdqUj9dx1V9x//4y/k0DefF7G/1Lk1lh4Eyo +aUx3jve/74Y87ICW1AhR2/TvdfWbsAkPyfy98k1SLR/9BulSIXIFeduxaFl7M3D8 +98aB5pqO8tPl2BFUJwh/uywDx0994MjQ8Xvrjmb9WJOAx9OyokivVCvmqJOkBzve +Fk/4KUHTFTGQCoXbbBlIQTC9hBd8c1S4t0gFGbcjlqTvr/ZnTdpSgbzZ/96/SVRm +dYOgjjpkrBOZgJPwsmmRQ2MufeZUtmkFSqdIRLGBNTefsMDDCGvyNeR/XCgM5Zfy +39PX/GHFKgq5Ei2ywEyZOGLCK5MwA12fMExYoedazFFjv6ApGpz+j831A2z/crEo +bRpVvd+rFzGnCKDq5viUD7cRzIPLVltYCNEayEgWta4KI+00/ayaaT6sM7N7oM32 +r01Wv02FvdG5Ag0EWhOndgEQAPiiTvmo9fZNW/5IxL7kDR6u9FEmEb2EZI+KxzbN +RYYY0IPsnA8TY9Rzj9D7xV8Vmf2Pd5SUyCtVwLfBKhadLh755NeehNXWIbW802gH +bvbykL/Zcn98oiLOVfK/Op/6MVpDuGXZ6CpDbQDSn6ne6/CWQnoz1+Wo+wbs1TOy +AhO6xKa20NtGIZrfZD01dSzRC5DMJD3GK1j6HdVUz5piwiTsGvGRJ3ZLfObdlHGn +CTMA39Jb8zQ0QtWPsOre0Nz2JQ53awMBaUhan5MeoOYp6ccsgD1BigyxmKb8iIDN +NM/Iwi0Ib5L4AiGh6fQFf0WF8p74yIn1WgFcWxJXR1ZzvMDDHXqq97SQtbr9FKhu +xrceh/92Ga4ruAJRCbMtmOTUP4APTeT4csANdgJxtW+I4QAp01BQSl75pB2QDlam ++tqePQDboAGc78Ck6096wML0ZMKDDxXPrI67uppuM02FYuJ41ZQjOytigeoGS88g +ByZwPcFIT+5XgtNC0BH7U9VIkiap5U00lykzEjcRjrZTtKqHdeFPbSEpv1QfIcLG +Ra439g9acRHX82sVzhzZk5uu9QKyDN1EpuWoLOaOrICHcMSC7GkVXS8+/7TX0vAN +vn/51fb+tHJekGfaPhsPuIbSba2kmUy8sSS/6JJHkJ1aEFigAPbwUbZTqNlb4IRm +FBVBABEBAAGJAjYEGAEIACAWIQQzHppe1yf63UKbKJTy8eq1ievPsQUCWhOndgIb 
+DAAKCRDy8eq1ievPsbrpEACQ8HqAvq3NuiM00WyHla7VtghCWVEmRozbYc4dR7u+ +sTQrVgbLfgR5zeSWCMHpEcaN/RS58O/i1Dk0DLHTu3NrarzrkEPlHwIgJQ7orxFD +YW3Z2Ytk40uKex4ou/8VzvXTpj1u8d/GHgGdvChBmtw5FaMgc8PBi4FnlIS5cAGU +1ca1RwMX0WpFsp9HgrQLVxgkDs/m7oRSmC5GvPDIpb5S9QFzJKYKTJxSfXXO6hCk +FGAGHWjVC26a/wSUtZQfb3G9sYZJuKUOwr4tpz1y6Ronc34cZYi1FlKWJuz01w4s +4PKjFG/wbYSd+QLfftyyVPMLdY+wCwc8O59QqKx5Rj8HQLxIwSL3chhmdAHCmejM +zKCpkFyLOc6+Wjet6hD6X3EsjIee1AAy22D24EaLJsju9zR/khJFS4K76aQX7dYN +aB3C7S5HGxvYGSqfnn4eBaEzrSOde7HEcqYpYKxS+jB1c4X4W91NSTsqDd0QJMVF +35eKfhWj+X6jWIC+48kfzypXdOCnPbto7wrr40yYCHw3XSXj40H5dWSsWEZVmS+s +Dzz6zy9maHVyXa/rNsL7OjqimtKad65r/wfSFPPIcR1jJfP4GMNHV0TYqxdyDaXg +iEVpHzOV7gd75fJbOvoNxNZj20Yj5sg8OCwbv8PxLXEcBFs7hhjQMhVRsjpNYzAR +Iw== +=rMlc +-----END PGP PUBLIC KEY BLOCK----- diff --git a/LICENSE b/LICENSE index d64569567334..1a02899feead 100644 --- a/LICENSE +++ b/LICENSE @@ -96,6 +96,7 @@ Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works @@ -200,3 +201,227 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + + ======================================================================= + Apache MXNET (incubating) Subcomponents: + + The Apache MXNET (incubating) project contains subcomponents with separate copyright + notices and license terms. Your use of the source code for the these + subcomponents is subject to the terms and conditions of the following + licenses. + + ======================================================================== + Apache-2.0 licenses + ======================================================================== + + The following components are provided under an Apache 2.0 license. + + 1. MXNet Cpp-package - For details, /cpp-package/LICENSE + 2. 
MXNet rcnn - For details, see, example/rcnn/LICENSE + 3. scala-package - For details, see, scala-package/LICENSE + 4. Warp-CTC - For details, see, src/operator/contrib/ctc_include/LICENSE + 5. dlpack - For details, see, dlpack/LICENSE + 6. dmlc-core - For details, see, dmlc-core/LICENSE + 7. mshadow - For details, see, mshadow/LICENSE + 8. nnvm/dmlc-core - For details, see, nnvm/dmlc-core/LICENSE + 9. nnvm - For details, see, nnvm/LICENSE + 10. nnvm/tvm - For details, see, nnvm/tvm/LICENSE + 11. nnvm/tvm/HalideIR/LICENSE - For details, see, nnvm/tvm/HalideIR/LICENSE + 12. nnvm-fusion - For details, see, nnvm/plugin/nnvm-fusion/LICENSE + 13. ps-lite - For details, see, ps-lite/LICENSE + + ======================================================================== + MIT licenses + ======================================================================== + + 1. Fast R-CNN - For details, see example/rcnn/LICENSE + 2. Faster R-CNN - For details, see example/rcnn/LICENSE + 3. tree_lstm - For details, see example/gluon/tree_lstm/LICENSE + + ======================================================================== + JQuery License (MIT license) + ======================================================================== + jQuery JavaScript Library v1.11.1 + http://jquery.com/ + + Includes Sizzle.js + http://sizzlejs.com/ + + Copyright 2005, 2014 jQuery Foundation, Inc. 
and other contributors + ---- + Released under the MIT license + MIT License + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + ---- + http://jquery.org/license + + Date: 2014-05-01T17:42Z + + ======================================================================== + NVIDIA Licenses + ======================================================================== + + 1. Warp-CTC + For details, see, src/operator/contrib/ctc_include/contrib/moderngpu/LICENSE + + /****************************************************************************** + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + 2. CUB Library + For details, see, cub/LICENSE.TXT + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of the NVIDIA CORPORATION nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + ======================================================================== + Other Licenses + ======================================================================== + + 1. Caffe + For details, see, example/rcnn/LICENSE + + LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + CONTRIBUTION AGREEMENT + + By contributing to the BVLC/caffe repository through pull-request, comment, + or otherwise, the contributor releases their content to the + license and copyright terms herein. + + + 2. MS COCO API + For details, see, example/rcnn/LICENSE + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF TH +E USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + The views and conclusions contained in the software and documentation are those + of the authors and should not be interpreted as representing official policies, + either expressed or implied, of the FreeBSD Project. + + + 3. Sphinx JavaScript utilties for the full-text search + + For details, see, docs/_static/searchtools_custom.js + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + diff --git a/Makefile b/Makefile index 33151e574ea7..ceed6450436c 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,11 @@ ROOTDIR = $(CURDIR) +ifeq ($(OS),Windows_NT) + UNAME_S := Windows +else + UNAME_S := $(shell uname -s) +endif + ifndef config ifdef CXXNET_CONFIG config = $(CXXNET_CONFIG) @@ -36,8 +42,8 @@ include $(config) ifeq ($(USE_MKL2017), 1) # must run ./prepare_mkl before including mshadow.mk - RETURN_STRING = $(shell ./prepare_mkl.sh $(MKLML_ROOT)) - MKLROOT = $(firstword $(RETURN_STRING)) + RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT)) + MKLROOT := $(firstword $(RETURN_STRING)) export USE_MKLML = $(lastword $(RETURN_STRING)) endif @@ -72,9 +78,14 @@ ifeq ($(USE_PROFILER), 1) CFLAGS += -DMXNET_USE_PROFILER=1 endif +# CFLAGS for segfault logger +ifeq ($(USE_SIGNAL_HANDLER), 1) + CFLAGS += -DMXNET_USE_SIGNAL_HANDLER=1 +endif + # Caffe Plugin ifdef CAFFE_PATH - CFLAGS += -DMXNET_USE_CAFFE=1 + CFLAGS += -DMXNET_USE_CAFFE=1 endif ifndef LINT_LANG @@ -91,7 +102,9 @@ else endif ifeq ($(USE_OPENMP), 1) - CFLAGS += -fopenmp + ifneq ($(UNAME_S), Darwin) + CFLAGS += -fopenmp + endif endif ifeq ($(USE_NNPACK), 1) @@ -105,11 +118,21 @@ ifeq ($(USE_MKL2017), 1) CFLAGS += -I$(ROOTDIR)/src/operator/mkl/ CFLAGS += -I$(MKLML_ROOT)/include LDFLAGS += -L$(MKLML_ROOT)/lib -ifeq ($(USE_MKL2017_EXPERIMENTAL), 1) - CFLAGS += -DMKL_EXPERIMENTAL=1 -else - CFLAGS += -DMKL_EXPERIMENTAL=0 + ifeq ($(USE_MKL2017_EXPERIMENTAL), 1) + CFLAGS += 
-DMKL_EXPERIMENTAL=1 + else + CFLAGS += -DMKL_EXPERIMENTAL=0 + endif + ifeq ($(UNAME_S), Darwin) + LDFLAGS += -lmklml + else + LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu + endif + LDFLAGS += -liomp5 endif + +ifeq ($(USE_OPERATOR_TUNING), 1) + CFLAGS += -DMXNET_USE_OPERATOR_TUNING=1 endif # verify existence of separate lapack library when using blas/openblas/atlas @@ -146,7 +169,60 @@ ifeq ($(USE_CUDNN), 1) LDFLAGS += -lcudnn endif +# gperftools malloc library (tcmalloc) +ifeq ($(USE_GPERFTOOLS), 1) +# FIND_LIBNAME=tcmalloc_and_profiler + FIND_LIBNAME=tcmalloc + FIND_LIBFILEEXT=so + FIND_LIBFILE=$(wildcard /lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT)) + ifeq (,$(FIND_LIBFILE)) + FIND_LIBFILE=$(wildcard /usr/lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT)) + ifeq (,$(FIND_LIBFILE)) + FIND_LIBFILE=$(wildcard /usr/local/lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT)) + ifeq (,$(FIND_LIBFILE)) + USE_GPERFTOOLS=0 + endif + endif + endif + ifeq ($(USE_GPERFTOOLS), 1) + CFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free + LDFLAGS += $(FIND_LIBFILE) + endif +endif + +# jemalloc malloc library (if not using gperftools) +ifneq ($(USE_GPERFTOOLS), 1) + ifeq ($(USE_JEMALLOC), 1) + FIND_LIBNAME=jemalloc + FIND_LIBFILEEXT=so + FIND_LIBFILE=$(wildcard /lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT)) + ifeq (,$(FIND_LIBFILE)) + FIND_LIBFILE=$(wildcard /usr/lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT)) + ifeq (,$(FIND_LIBFILE)) + FIND_LIBFILE=$(wildcard /usr/local/lib/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT)) + ifeq (,$(FIND_LIBFILE)) + FIND_LIBFILE=$(wildcard /usr/lib/x86_64-linux-gnu/lib$(FIND_LIBNAME).$(FIND_LIBFILEEXT)) + ifeq (,$(FIND_LIBFILE)) + USE_JEMALLOC=0 + endif + endif + endif + endif + ifeq ($(USE_JEMALLOC), 1) + CFLAGS += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc \ + -fno-builtin-free -DUSE_JEMALLOC + LDFLAGS += $(FIND_LIBFILE) + endif + endif +endif +# If not using tcmalloc or jemalloc, print a warning (user should consider 
installing) +ifneq ($(USE_GPERFTOOLS), 1) + ifneq ($(USE_JEMALLOC), 1) +$(warning WARNING: Significant performance increases can be achieved by installing and \ +enabling gperftools or jemalloc development packages) + endif +endif ifeq ($(USE_THREADED_ENGINE), 1) CFLAGS += -DMXNET_USE_THREADED_ENGINE @@ -166,8 +242,8 @@ endif # Sets 'CUDA_ARCH', which determines the GPU architectures supported # by the compiled kernels. Users can edit the KNOWN_CUDA_ARCHS list below -# to remove archs they don't wish to support to speed compilation, or they -# can pre-set the CUDA_ARCH args in config.mk for full control. +# to remove archs they don't wish to support to speed compilation, or they can +# pre-set the CUDA_ARCH args in config.mk to a non-null value for full control. # # For archs in this list, nvcc will create a fat-binary that will include # the binaries (SASS) for all architectures supported by the installed version @@ -175,13 +251,13 @@ endif # If these kernels are then run on a newer-architecture GPU, the binary will # be JIT-compiled by the updated driver from the included PTX. ifeq ($(USE_CUDA), 1) -ifeq ($(origin CUDA_ARCH), undefined) +ifeq ($(CUDA_ARCH),) KNOWN_CUDA_ARCHS := 30 35 50 52 60 61 70 # Run nvcc on a zero-length file to check architecture-level support. # Create args to include SASS in the fat binary for supported levels. CUDA_ARCH := $(foreach arch,$(KNOWN_CUDA_ARCHS), \ - $(shell $(NVCC) -arch=sm_$(arch) -E --x cu /dev/null >/dev/null 2>&1 && \ - echo -gencode arch=compute_$(arch),code=sm_$(arch))) + $(shell $(NVCC) -arch=sm_$(arch) -E --x cu /dev/null >/dev/null 2>&1 && \ + echo -gencode arch=compute_$(arch),code=sm_$(arch))) # Convert a trailing "code=sm_NN" to "code=[sm_NN,compute_NN]" to also # include the PTX of the most recent arch in the fat-binaries for # forward compatibility with newer GPUs. @@ -189,7 +265,7 @@ ifeq ($(origin CUDA_ARCH), undefined) # Add fat binary compression if supported by nvcc. 
COMPRESS := --fatbin-options -compress-all CUDA_ARCH += $(shell $(NVCC) -cuda $(COMPRESS) --x cu /dev/null -o /dev/null >/dev/null 2>&1 && \ - echo $(COMPRESS)) + echo $(COMPRESS)) endif endif @@ -231,20 +307,18 @@ PLUGIN_OBJ = PLUGIN_CUOBJ = include $(MXNET_PLUGINS) -# scala package profile -ifeq ($(OS),Windows_NT) +ifeq ($(UNAME_S), Windows) # TODO(yizhi) currently scala package does not support windows SCALA_PKG_PROFILE := windows else - UNAME_S := $(shell uname -s) ifeq ($(UNAME_S), Darwin) WHOLE_ARCH= -all_load NO_WHOLE_ARCH= -noall_load SCALA_PKG_PROFILE := osx-x86_64 else - SCALA_PKG_PROFILE := linux-x86_64 WHOLE_ARCH= --whole-archive NO_WHOLE_ARCH= --no-whole-archive + SCALA_PKG_PROFILE := linux-x86_64 endif endif @@ -255,22 +329,37 @@ ALL_DEP = $(OBJ) $(EXTRA_OBJ) $(PLUGIN_OBJ) $(LIB_DEP) ifeq ($(USE_CUDA), 1) CFLAGS += -I$(ROOTDIR)/cub ALL_DEP += $(CUOBJ) $(EXTRA_CUOBJ) $(PLUGIN_CUOBJ) - LDFLAGS += -lcuda -lcufft + LDFLAGS += -lcuda -lcufft -lnvrtc SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-gpu + ifeq ($(USE_NCCL), 1) + ifneq ($(USE_NCCL_PATH), NONE) + CFLAGS += -I$(USE_NCCL_PATH)/include + LDFLAGS += -L$(USE_NCCL_PATH)/lib + endif + LDFLAGS += -lnccl + CFLAGS += -DMXNET_USE_NCCL=1 + else + CFLAGS += -DMXNET_USE_NCCL=0 + endif else SCALA_PKG_PROFILE := $(SCALA_PKG_PROFILE)-cpu + CFLAGS += -DMXNET_USE_NCCL=0 endif -# For quick compile test, used smaller subset -ALLX_DEP= $(ALL_DEP) - -ifeq ($(USE_NVRTC), 1) - LDFLAGS += -lnvrtc - CFLAGS += -DMXNET_USE_NVRTC=1 +ifeq ($(USE_LIBJPEG_TURBO), 1) + ifneq ($(USE_LIBJPEG_TURBO_PATH), NONE) + CFLAGS += -I$(USE_LIBJPEG_TURBO_PATH)/include + LDFLAGS += -L$(USE_LIBJPEG_TURBO_PATH)/lib + endif + LDFLAGS += -lturbojpeg + CFLAGS += -DMXNET_USE_LIBJPEG_TURBO=1 else - CFLAGS += -DMXNET_USE_NVRTC=0 + CFLAGS += -DMXNET_USE_LIBJPEG_TURBO=0 endif +# For quick compile test, used smaller subset +ALLX_DEP= $(ALL_DEP) + build/src/%.o: src/%.cc @mkdir -p $(@D) $(CXX) -std=c++11 -c $(CFLAGS) -MMD -c $< -o $@ @@ -307,9 +396,9 @@ 
lib/libmxnet.a: $(ALLX_DEP) ar crv $@ $(filter %.o, $?) lib/libmxnet.so: $(ALLX_DEP) - @mkdir -p $(@D) - $(CXX) $(CFLAGS) -shared -o $@ $(filter-out %libnnvm.a, $(filter %.o %.a, $^)) $(LDFLAGS) \ - -Wl,${WHOLE_ARCH} $(filter %libnnvm.a, $^) -Wl,${NO_WHOLE_ARCH} + @mkdir -p $(@D) + $(CXX) $(CFLAGS) -shared -o $@ $(filter-out %libnnvm.a, $(filter %.o %.a, $^)) $(LDFLAGS) \ + -Wl,${WHOLE_ARCH} $(filter %libnnvm.a, $^) -Wl,${NO_WHOLE_ARCH} $(PS_PATH)/build/libps.a: PSLITE @@ -346,7 +435,7 @@ test: $(TEST) lint: cpplint rcpplint jnilint pylint cpplint: - python2 dmlc-core/scripts/lint.py mxnet cpp include src plugin cpp-package tests \ + dmlc-core/scripts/lint.py mxnet cpp include src plugin cpp-package tests \ --exclude_path src/operator/contrib/ctc_include pylint: @@ -378,7 +467,7 @@ cyclean: # R related shortcuts rcpplint: - python2 dmlc-core/scripts/lint.py mxnet-rcpp ${LINT_LANG} R-package/src + dmlc-core/scripts/lint.py mxnet-rcpp ${LINT_LANG} R-package/src rpkg: mkdir -p R-package/inst @@ -400,17 +489,19 @@ rpkg: devtools::install_version('roxygen2',version='5.0.1',\ repo='https://cloud.r-project.org/',quiet=TRUE)}" Rscript -e "require(roxygen2); roxygen2::roxygenise('R-package')" - R CMD build --no-build-vignettes R-package - rm -rf mxnet_current_r.tar.gz + R CMD INSTALL R-package rm -rf R-package/src/image_recordio.h - mv mxnet_*.tar.gz mxnet_current_r.tar.gz rpkgtest: Rscript -e "require(testthat);res<-test_dir('R-package/tests/testthat');if(!testthat:::all_passed(res)){stop('Test failures', call. 
= FALSE)}" +scalaclean: + (cd $(ROOTDIR)/scala-package; \ + mvn clean -P$(SCALA_PKG_PROFILE)) + scalapkg: (cd $(ROOTDIR)/scala-package; \ - mvn clean package -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \ + mvn package -P$(SCALA_PKG_PROFILE) -Dcxx="$(CXX)" \ -Dcflags="$(CFLAGS)" -Dldflags="$(LDFLAGS)" \ -Dcurrent_libdir="$(ROOTDIR)/lib" \ -Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a") @@ -434,7 +525,7 @@ scaladeploy: -Dlddeps="$(LIB_DEP) $(ROOTDIR)/lib/libmxnet.a") jnilint: - python2 dmlc-core/scripts/lint.py mxnet-jnicpp cpp scala-package/native/src + dmlc-core/scripts/lint.py mxnet-jnicpp cpp scala-package/native/src ifneq ($(EXTRA_OPERATORS),) clean: cyclean $(EXTRA_PACKAGES_CLEAN) diff --git a/NEWS.md b/NEWS.md index 4f1ecd15689c..fc6b10188fc7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,121 @@ MXNet Change Log ================ -## 0.11.0-rc2 -### - Major Features +## 1.0.0 +### Performance + - Enhanced the performance of `sparse.dot` operator. + - MXNet now automatically set OpenMP to use all available CPU cores to maximize CPU utilization when `NUM_OMP_THREADS` is not set. + - Unary and binary operators now avoid using OpenMP on small arrays if using OpenMP actually hurts performance due to multithreading overhead. + - Significantly improved performance of `broadcast_add`, `broadcast_mul`, etc on CPU. + - Added bulk execution to imperative mode. You can control segment size with `mxnet.engine.bulk`. As a result, the speed of Gluon in hybrid mode is improved, especially on small networks and multiple GPUs. + - Improved speed for `ctypes` invocation from Python frontend. +### New Features - Gradient Compression [Experimental] + - Speed up multi-GPU and distributed training by compressing communication of gradients. This is especially effective when training networks with large fully-connected layers. In Gluon this can be activated with `compression_params` in Trainer. 
+### New Features - Support of NVIDIA Collective Communication Library (NCCL) [Experimental] + - Use `kvstore=’nccl’` for (in some cases) faster training on multiple GPUs. + - Significantly faster than kvstore=’device’ when batch size is small. + - It is recommended to set environment variable `NCCL_LAUNCH_MODE` to `PARALLEL` when using NCCL version 2.1 or newer. +### New Features - Advanced Indexing [General Availability] + - NDArray now supports advanced indexing (both slice and assign) as specified by the numpy standard: https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.indexing.html#combining-advanced-and-basic-indexing with the following restrictions: + - if key is a list type, only a list of integers is supported, e.g. `key=[1, 2]` is supported, while not for `key=[[1, 2]]`. + - Ellipsis (...) and np.newaxis are not supported. + - `Boolean` array indexing is not supported. +### New Features - Gluon [General Availability] + - Performance optimizations discussed above. + - Added support for loading data in parallel with multiple processes to `gluon.data.DataLoader`. The number of workers can be set with `num_worker`. Does not support windows yet. + - Added Block.cast to support networks with different data types, e.g. `float16`. + - Added Lambda block for wrapping a user defined function as a block. + - Generalized `gluon.data.ArrayDataset` to support arbitrary number of arrays. +### New Features - ARM / Raspberry Pi support [Experimental] + - MXNet now compiles and runs on ARMv6, ARMv7, ARMv64 including Raspberry Pi devices. See https://github.com/apache/incubator-mxnet/tree/master/docker_multiarch for more information. +### New Features - NVIDIA Jetson support [Experimental] + - MXNet now compiles and runs on NVIDIA Jetson TX2 boards with GPU acceleration. + - You can install the python MXNet package on a Jetson board by running - `$ pip install mxnet-jetson-tx2`. 
+### New Features - Sparse Tensor Support [General Availability] + - Added more sparse operators: `contrib.SparseEmbedding`, `sparse.sum` and `sparse.mean`. + - Added `asscipy()` for easier conversion to scipy. + - Added `check_format()` for sparse ndarrays to check if the array format is valid. +### Bug-fixes + - Fixed a bug where `a[-1]` indexing did not work on `NDArray`. + - Fixed `expand_dims` if axis < 0. + - Fixed a bug that causes topk to produce incorrect results on large arrays. + - Improved numerical precision of unary and binary operators for `float64` data. + - Fixed derivatives of log2 and log10. They used to be the same as log. + - Fixed a bug that causes MXNet to hang after fork. Note that you still cannot use GPU in child processes after fork due to limitations of CUDA. + - Fixed a bug that causes `CustomOp` to fail when using auxiliary states. + - Fixed a security bug that is causing MXNet to listen on all available interfaces when running training in distributed mode. +### Doc Updates + - Added a security best practices document under FAQ section. + - Fixed License Headers including restoring copyright attributions. + - Documentation updates. + - Links for viewing source. + + For more information and examples, see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/Apache+MXNet+%28incubating%29+1.0+Release+Notes) + + +## 0.12.1 +### Bug-fixes + - Added GPU support for the `syevd` operator which ensures that there is GPU support for all linalg-operators. + - Bugfix for `syevd` on CPU such that it works for `float32`. + - Fixed API call when `OMP_NUM_THREADS` environment variable is set. + - Fixed `MakeNonlossGradNode` bug. + - Fixed bug related to passing `dtype` to `array()`. + - Fixed some minor bugs for sparse distributed training. + - Fixed a bug on `Slice` accessing uninitialized memory in `param.begin` in the file `matrix_op-inl.h`. + - Fixed `gluon.data.RecordFileDataset`.
+ - Fixed a bug that caused `autograd` to crash on some networks. + + +## 0.12.0 +### Performance + - Added full support for NVIDIA Volta GPU Architecture and CUDA 9. Training CNNs is up to 3.5x faster than Pascal when using float16 precision. + - Enabled JIT compilation. Autograd and Gluon hybridize now use less memory and have faster speed. Performance is almost the same as old symbolic style code. + - Improved ImageRecordIO image loading performance and added indexed RecordIO support. + - Added better openmp thread management to improve CPU performance. +### New Features - Gluon + - Added enhancements to the Gluon package, a high-level interface designed to be easy to use while keeping most of the flexibility of low level API. Gluon supports both imperative and symbolic programming, making it easy to train complex models imperatively with minimal impact on performance. Neural networks (and other machine learning models) can be defined and trained with `gluon.nn` and `gluon.rnn` packages. + - Added new loss functions - `SigmoidBinaryCrossEntropyLoss`, `CTCLoss`, `HuberLoss`, `HingeLoss`, `SquaredHingeLoss`, `LogisticLoss`, `TripletLoss`. + - `gluon.Trainer` now allows reading and setting learning rate with `trainer.learning_rate` property. + - Added API `HybridBlock.export` for exporting gluon models to MXNet format. + - Added `gluon.contrib` package. + - Convolutional recurrent network cells for RNN, LSTM and GRU. + - `VariationalDropoutCell` +### New Features - Autograd + - Added enhancements to `autograd` package, which enables automatic differentiation of NDArray operations. + - `autograd.Function` allows defining both forward and backward computation for custom operators. + - Added `mx.autograd.grad` and experimental second order gradient support (most operators don't support second order gradient yet). + - Autograd now supports cross-device graphs. Use `x.copyto(mx.gpu(i))` and `x.copyto(mx.cpu())` to do computation on multiple devices.
+### New Features - Sparse Tensor Support + - Added support for sparse matrices. + - Added limited cpu support for two sparse formats in `Symbol` and `NDArray` - `CSRNDArray` and `RowSparseNDArray`. + - Added a sparse dot product operator and many element-wise sparse operators. + - Added a data iterator for sparse data input - `LibSVMIter`. + - Added three optimizers for sparse gradient updates: `Ftrl`, `SGD` and `Adam`. + - Added `push` and `row_sparse_pull` with `RowSparseNDArray` in distributed kvstore. +### Other New Features + - Added limited support for fancy indexing, which allows you to very quickly access and modify complicated subsets of an array's values. `x[idx_arr0, idx_arr1, ..., idx_arrn]` is now supported. Features such as combining and slicing are planned for the next release. Checkout master to get a preview. + - Random number generators in `mx.nd.random.*` and `mx.sym.random.*` now support both CPU and GPU. + - `NDArray` and `Symbol` now support "fluent" methods. You can now use `x.exp()` etc instead of `mx.nd.exp(x)` or `mx.sym.exp(x)`. + - Added `mx.rtc.CudaModule` for writing and running CUDA kernels from python. + - Added `multi_precision` option to optimizer for easier float16 training. + - Better support for IDE auto-completion. IDEs like PyCharm can now correctly parse mxnet operators. +### API Changes + - Operators like `mx.sym.linalg_*` and `mx.sym.random_*` are now moved to `mx.sym.linalg.*` and `mx.sym.random.*`. The old names are still available but deprecated. + - `sample_*` and `random_*` are now merged as `random.*`, which supports both scalar and `NDArray` distribution parameters. +### Bug-fixes + - Fixed a bug that causes `argsort` operator to fail on large tensors. + - Fixed numerical stability issues when summing large tensors. + - Fixed a bug that causes arange operator to output wrong results for large ranges. + - Improved numerical precision for unary and binary operators on `float64` inputs.
+ +For more information and examples, see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/MXNet+0.12.0+Release+Notes) + + +## 0.11.0 +### Major Features - Apple Core ML model converter - Support for Keras v1.2.2 - For more information see [full release notes](https://cwiki.apache.org/confluence/display/MXNET/v0.11.0+Release+Notes) -### - API Changes +### API Changes - Added `CachedOp`. You can now cache the operators that’s called frequently with the same set of arguments to reduce overhead. - Added sample_multinomial for sampling from multinomial distributions. - Added `trunc` operator for rounding towards zero. @@ -16,9 +126,9 @@ MXNet Change Log - `allow_extra` is added to Module.set_params to ignore extra parameters. - Added `mod` operator for modulo. - Added `multi_precision` option to SGD optimizer to improve training with float16. Resnet50 now achieves the same accuracy when trained with float16 and gives 50% speedup on Titan XP. -### - Performance Improvements +### Performance Improvements - ImageRecordIter now stores data in pinned memory to improve GPU memcopy speed. -### - Bugfixes +### Bugfixes - Cython interface is fixed. `make cython` and `python setup.py install --with-cython` should install the cython interface and reduce overhead in applications that use imperative/bucketing. - Fixed various bugs in Faster-RCNN example: https://github.com/dmlc/mxnet/pull/6486 - Fixed various bugs in SSD example. @@ -28,7 +138,7 @@ MXNet Change Log - Fixed context mismatch when loading optimizer states. - Fixed a bug in ReLU activation when using MKL. - Fixed a few race conditions that causes crashes on shutdown. -### - Refactors +### Refactors - Refactored TShape/TBlob to use int64 dimensions and DLTensor as internal storage. Getting ready for migration to DLPack. As a result TBlob::dev_mask_ and TBlob::stride_ are removed. 
diff --git a/NOTICE b/NOTICE index 03695607e3e9..d5327226ae6c 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,13 @@ -Apache MXNET (incubating) -Copyright [2015-2017] The Apache Software Foundation + Apache MXNET (incubating) + Copyright 2017- The Apache Software Foundation + + This product includes software developed at + The Apache Software Foundation (http://www.apache.org/). + + + + + + + -This product includes software developed at -The Apache Software Foundation (http://www.apache.org/). diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index e0b435513718..6e0f93294bf7 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: mxnet Type: Package Title: MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems -Version: 0.11.0 +Version: 1.0.0 Date: 2017-06-27 Author: Tianqi Chen, Qiang Kou, Tong He Maintainer: Qiang Kou diff --git a/R-package/R/gru.R b/R-package/R/gru.R deleted file mode 100644 index d2ffd9a414c2..000000000000 --- a/R-package/R/gru.R +++ /dev/null @@ -1,355 +0,0 @@ -# gru cell symbol -gru <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) { - if (dropout > 0) - indata <- mx.symbol.Dropout(data=indata, p=dropout) - i2h <- mx.symbol.FullyConnected(data=indata, - weight=param$gates.i2h.weight, - bias=param$gates.i2h.bias, - num.hidden=num.hidden * 2, - name=paste0("t", seqidx, ".l", layeridx, ".gates.i2h")) - h2h <- mx.symbol.FullyConnected(data=prev.state$h, - weight=param$gates.h2h.weight, - bias=param$gates.h2h.bias, - num.hidden=num.hidden * 2, - name=paste0("t", seqidx, ".l", layeridx, ".gates.h2h")) - gates <- i2h + h2h - slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=2, - name=paste0("t", seqidx, ".l", layeridx, ".slice")) - update.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid") - reset.gate <- mx.symbol.Activation(slice.gates[[2]], act.type="sigmoid") - - htrans.i2h <- mx.symbol.FullyConnected(data=indata, - 
weight=param$trans.i2h.weight, - bias=param$trans.i2h.bias, - num.hidden=num.hidden, - name=paste0("t", seqidx, ".l", layeridx, ".trans.i2h")) - h.after.reset <- prev.state$h * reset.gate - htrans.h2h <- mx.symbol.FullyConnected(data=h.after.reset, - weight=param$trans.h2h.weight, - bias=param$trans.h2h.bias, - num.hidden=num.hidden, - name=paste0("t", seqidx, ".l", layeridx, ".trans.h2h")) - h.trans <- htrans.i2h + htrans.h2h - h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh") - next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h) - return (list(h=next.h)) -} - -# unrolled gru network -gru.unroll <- function(num.gru.layer, seq.len, input.size, - num.hidden, num.embed, num.label, dropout=0) { - embed.weight <- mx.symbol.Variable("embed.weight") - cls.weight <- mx.symbol.Variable("cls.weight") - cls.bias <- mx.symbol.Variable("cls.bias") - param.cells <- lapply(1:num.gru.layer, function(i) { - cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")), - gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")), - gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")), - gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")), - trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")), - trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")), - trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")), - trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias"))) - return (cell) - }) - last.states <- lapply(1:num.gru.layer, function(i) { - state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - return (state) - }) - - # embeding layer - label <- mx.symbol.Variable("label") - data <- mx.symbol.Variable("data") - embed <- mx.symbol.Embedding(data=data, input.dim=input.size, - weight=embed.weight, output.dim=num.embed, name='embed') - wordvec <- mx.symbol.SliceChannel(data=embed, 
num.outputs=seq.len, squeeze.axis=1) - - last.hidden <- list() - for (seqidx in 1:seq.len) { - hidden <- wordvec[[seqidx]] - # stack GRU - for (i in 1:num.gru.layer) { - dp <- ifelse(i==1, 0, dropout) - next.state <- gru(num.hidden, indata=hidden, - prev.state=last.states[[i]], - param=param.cells[[i]], - seqidx=seqidx, layeridx=i, - dropout=dp) - hidden <- next.state$h - last.states[[i]] <- next.state - } - # decoder - if (dropout > 0) - hidden <- mx.symbol.Dropout(data=hidden, p=dropout) - last.hidden <- c(last.hidden, hidden) - } - last.hidden$dim <- 0 - last.hidden$num.args <- seq.len - concat <-mxnet:::mx.varg.symbol.Concat(last.hidden) - fc <- mx.symbol.FullyConnected(data=concat, - weight=cls.weight, - bias=cls.bias, - num.hidden=num.label) - - label <- mx.symbol.transpose(data=label) - label <- mx.symbol.Reshape(data=label, target.shape=c(0)) - - loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm") - return (loss.all) -} - -# gru inference model symbol -gru.inference.symbol <- function(num.gru.layer, seq.len, input.size, - num.hidden, num.embed, num.label, dropout=0) { - seqidx <- 1 - embed.weight <- mx.symbol.Variable("embed.weight") - cls.weight <- mx.symbol.Variable("cls.weight") - cls.bias <- mx.symbol.Variable("cls.bias") - - param.cells <- lapply(1:num.gru.layer, function(i) { - cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")), - gates.i2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.i2h.bias")), - gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")), - gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")), - trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")), - trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")), - trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")), - trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias"))) - return (cell) - }) - last.states <- 
lapply(1:num.gru.layer, function(i) { - state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - return (state) - }) - - # embeding layer - data <- mx.symbol.Variable("data") - hidden <- mx.symbol.Embedding(data=data, input_dim=input.size, - weight=embed.weight, output_dim=num.embed, name="embed") - - # stack GRU - for (i in 1:num.gru.layer) { - dp <- ifelse(i==1, 0, dropout) - next.state <- gru(num.hidden, indata=hidden, - prev.state=last.states[[i]], - param=param.cells[[i]], - seqidx=seqidx, layeridx=i, - dropout=dp) - hidden <- next.state$h - last.states[[i]] <- next.state - } - # decoder - if (dropout > 0) - hidden <- mx.symbol.Dropout(data=hidden, p=dropout) - - fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label, - weight=cls.weight, bias=cls.bias, name='pred') - sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') - unpack.h <- lapply(1:num.gru.layer, function(i) { - state <- last.states[[i]] - state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")) - return (state.h) - }) - - list.all <- c(sm, unpack.h) - return (mx.symbol.Group(list.all)) -} - -#' Training GRU Unrolled Model -#' -#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array) -#' The Training set. -#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional -#' The validation set used for validation evaluation during the progress. -#' @param num.gru.layer integer -#' The number of the layer of gru. -#' @param seq.len integer -#' The length of the input sequence. -#' @param num.hidden integer -#' The number of hidden nodes. -#' @param num.embed integer -#' The output dim of embedding. -#' @param num.label integer -#' The number of labels. -#' @param batch.size integer -#' The batch size used for R array training. -#' @param input.size integer -#' The input dim of one-hot encoding of embedding -#' @param ctx mx.context, optional -#' The device used to perform training. 
-#' @param num.round integer, default=10 -#' The number of iterations over training data to train the model. -#' @param update.period integer, default=1 -#' The number of iterations to update parameters during training period. -#' @param initializer initializer object. default=mx.init.uniform(0.01) -#' The initialization scheme for parameters. -#' @param dropout float, default=0 -#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @param optimizer string, default="sgd" -#' The optimization method. -#' @param ... other parameters passing to \code{mx.gru}/. -#' @return model A trained gru unrolled model. -#' -#' @export -mx.gru <- function( train.data, eval.data=NULL, - num.gru.layer, seq.len, - num.hidden, num.embed, num.label, - batch.size, input.size, - ctx=mx.ctx.default(), - num.round=10, update.period=1, - initializer=mx.init.uniform(0.01), - dropout=0, optimizer='sgd', - ...) { - # check data and change data into iterator - train.data <- check.data(train.data, batch.size, TRUE) - eval.data <- check.data(eval.data, batch.size, FALSE) - - # get unrolled gru symbol - rnn.sym <- gru.unroll( num.gru.layer=num.gru.layer, - num.hidden=num.hidden, - seq.len=seq.len, - input.size=input.size, - num.embed=num.embed, - num.label=num.label, - dropout=dropout) - - init.states.name <- lapply(1:num.gru.layer, function(i) { - state.h <- paste0("l", i, ".init.h") - return (state.h) - }) - - # set up gru model - model <- setup.rnn.model(rnn.sym=rnn.sym, - ctx=ctx, - num.rnn.layer=num.gru.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - batch.size=batch.size, - input.size=input.size, - init.states.name=init.states.name, - initializer=initializer, - dropout=dropout) - - # train gru model - model <- train.rnn( model, train.data, eval.data, - num.round=num.round, - update.period=update.period, - ctx=ctx, - init.states.name=init.states.name, - ...) 
- # change model into MXFeedForwardModel - model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays) - return(structure(model, class="MXFeedForwardModel")) -} - -#' Create a GRU Inference Model -#' -#' @param num.gru.layer integer -#' The number of the layer of gru. -#' @param input.size integer -#' The input dim of one-hot encoding of embedding -#' @param num.hidden integer -#' The number of hidden nodes. -#' @param num.embed integer -#' The output dim of embedding. -#' @param num.label integer -#' The number of labels. -#' @param batch.size integer, default=1 -#' The batch size used for R array training. -#' @param arg.params list -#' The batch size used for R array training. -#' @param ctx mx.context, optional -#' Model parameter, list of name to NDArray of net's weights. -#' @param dropout float, default=0 -#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) -#' A gru inference model. -#' -#' @export -mx.gru.inference <- function(num.gru.layer, - input.size, - num.hidden, - num.embed, - num.label, - batch.size=1, - arg.params, - ctx=mx.cpu(), - dropout=0.) 
{ - sym <- gru.inference.symbol(num.gru.layer=num.gru.layer, - input.size=input.size, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - dropout=dropout) - - init.states.name <- lapply(1:num.gru.layer, function(i) { - state.h <- paste0("l", i, ".init.h") - return (state.h) - }) - - seq.len <- 1 - # set up gru model - model <- setup.rnn.model(rnn.sym=sym, - ctx=ctx, - num.rnn.layer=num.gru.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - batch.size=batch.size, - input.size=input.size, - init.states.name=init.states.name, - initializer=mx.init.uniform(0.01), - dropout=dropout) - arg.names <- names(model$rnn.exec$ref.arg.arrays) - for (k in names(arg.params)) { - if ((k %in% arg.names) && is.param.name(k) ) { - rnn.input <- list() - rnn.input[[k]] <- arg.params[[k]] - mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) - } - } - init.states <- list() - for (i in 1:num.gru.layer) { - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - - return (model) -} - -#' Using forward function to predict in gru inference model -#' -#' @param model gru model -#' A gru inference model -#' @param input.data, array.matrix -#' The input data for forward function -#' @param new.seq boolean, default=FALSE -#' Whether the input is the start of a new sequence -#' -#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. 
-#' -#' @export -mx.gru.forward <- function(model, input.data, new.seq=FALSE) { - if (new.seq == TRUE) { - init.states <- list() - for (i in 1:model$num.rnn.layer) { - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - } - dim(input.data) <- c(model$batch.size) - data <- list(data=mx.nd.array(input.data)) - mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) - mx.exec.forward(model$rnn.exec, is.train=FALSE) - init.states <- list() - for (i in 1:model$num.rnn.layer) { - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - prob <- model$rnn.exec$ref.outputs[["sm_output"]] - return (list(prob=prob, model=model)) -} - diff --git a/R-package/R/initializer.R b/R-package/R/initializer.R index 7a1ffb2b182a..9f5e75be91a5 100644 --- a/R-package/R/initializer.R +++ b/R-package/R/initializer.R @@ -4,11 +4,11 @@ #' @param shape the shape of the array to be generated. 
#' mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) { - if (endsWith(name, "bias")) return (mx.nd.zeros(shape, ctx)) - if (endsWith(name, "gamma")) return (mx.nd.ones(shape, ctx)) - if (endsWith(name, "beta")) return (mx.nd.zeros(shape, ctx)) - if (endsWith(name, "moving_mean")) return (mx.nd.zeros(shape, ctx)) - if (endsWith(name, "moving_var")) return (mx.nd.ones(shape, ctx)) + if (endsWith(name, "bias")) return (mx.nd.zeros(shape)) + if (endsWith(name, "gamma")) return (mx.nd.ones(shape)) + if (endsWith(name, "beta")) return (mx.nd.zeros(shape)) + if (endsWith(name, "moving_mean")) return (mx.nd.zeros(shape)) + if (endsWith(name, "moving_var")) return (mx.nd.ones(shape)) if (allow.unknown) return(NULL) stop(paste("Unkown initialization pattern for ", name)) } @@ -21,9 +21,9 @@ mx.init.internal.default <- function(name, shape, ctx, allow.unknown=FALSE) { mx.init.uniform <- function(scale) { function(name, shape, ctx, allow.unknown=FALSE) { if (!endsWith(name, "weight")) { - return (mx.init.internal.default(name, shape, ctx, allow.unknown)) + return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown)) } - return (mx.runif(shape, -scale, scale, ctx)) + return (mx.nd.random.uniform(low = -scale, high = scale, shape = shape)) } } @@ -35,9 +35,9 @@ mx.init.uniform <- function(scale) { mx.init.normal <- function(sd) { function(name, shape, ctx, allow.unknown=FALSE) { if (!endsWith(name, "weight")) { - return (mx.init.internal.default(name, shape, ctx, allow.unknown)) + return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown)) } - return (mx.rnorm(shape, 0, sd, ctx)) + return (mx.nd.random.normal(loc = 0, scale = sd, shape = shape)) } } @@ -56,9 +56,9 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg", magnitude = 3){ function(name, shape, ctx, allow.unknown = FALSE){ if (!endsWith(name, "weight")) { - return (mx.init.internal.default(name, shape, ctx, 
allow.unknown)) + return (mx.init.internal.default(name = name, shape = shape, allow.unknown = allow.unknown)) } - + fan_out = shape[length(shape)] fan_in = prod(shape[-length(shape)]) factor_val = 1 @@ -71,13 +71,13 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg", } else { stop("Not supported factor type. See usage of function mx.init.Xavier") } - + scale = sqrt(magnitude / factor_val) - + if (rnd_type == "uniform"){ - return(mx.runif(shape, -scale, scale, ctx)) + return(mx.nd.random.uniform(low = -scale, high = scale, shape = shape)) } else if (rnd_type == "gaussian"){ - return(mx.rnorm(shape, 0, scale, ctx)) + return(mx.nd.random.normal(loc = 0, scale = scale, shape = shape)) } else { stop("Not supported random type. See usage of function mx.init.Xavier") } @@ -92,7 +92,7 @@ mx.init.Xavier <- function(rnd_type = "uniform", factor_type = "avg", #' @param ctx mx.context The context of the weights #' @param skip.unknown Whether skip the unknown weight types #' @export -mx.init.create <- function(initializer, shape.array, ctx, skip.unknown=TRUE) { +mx.init.create <- function(initializer, shape.array, ctx=NULL, skip.unknown=TRUE) { if (length(shape.array) == 0) return(list()) names = names(shape.array) ret <- lapply(1 : length(names), function(i) { diff --git a/R-package/R/lstm.R b/R-package/R/lstm.R deleted file mode 100644 index 622388993c8c..000000000000 --- a/R-package/R/lstm.R +++ /dev/null @@ -1,388 +0,0 @@ -# lstm cell symbol -lstm <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout=0) { - if (dropout > 0) - indata <- mx.symbol.Dropout(data=indata, p=dropout) - i2h <- mx.symbol.FullyConnected(data=indata, - weight=param$i2h.weight, - bias=param$i2h.bias, - num.hidden=num.hidden * 4, - name=paste0("t", seqidx, ".l", layeridx, ".i2h")) - h2h <- mx.symbol.FullyConnected(data=prev.state$h, - weight=param$h2h.weight, - bias=param$h2h.bias, - num.hidden=num.hidden * 4, - name=paste0("t", seqidx, ".l", layeridx, 
".h2h")) - gates <- i2h + h2h - slice.gates <- mx.symbol.SliceChannel(gates, num.outputs=4, - name=paste0("t", seqidx, ".l", layeridx, ".slice")) - - in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type="sigmoid") - in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type="tanh") - forget.gate <- mx.symbol.Activation(slice.gates[[3]], act.type="sigmoid") - out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type="sigmoid") - next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform) - next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh") - - return (list(c=next.c, h=next.h)) -} - -# unrolled lstm network -lstm.unroll <- function(num.lstm.layer, seq.len, input.size, - num.hidden, num.embed, num.label, dropout=0.) { - - embed.weight <- mx.symbol.Variable("embed.weight") - cls.weight <- mx.symbol.Variable("cls.weight") - cls.bias <- mx.symbol.Variable("cls.bias") - - param.cells <- lapply(1:num.lstm.layer, function(i) { - cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), - i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), - h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), - h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) - return (cell) - }) - last.states <- lapply(1:num.lstm.layer, function(i) { - state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), - h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - return (state) - }) - - # embeding layer - label <- mx.symbol.Variable("label") - data <- mx.symbol.Variable("data") - embed <- mx.symbol.Embedding(data=data, input_dim=input.size, - weight=embed.weight, output_dim=num.embed, name="embed") - wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1) - - last.hidden <- list() - for (seqidx in 1:seq.len) { - hidden <- wordvec[[seqidx]] - # stack lstm - for (i in 1:num.lstm.layer) { - dp <- ifelse(i==1, 0, dropout) - next.state <- lstm(num.hidden, indata=hidden, - 
prev.state=last.states[[i]], - param=param.cells[[i]], - seqidx=seqidx, layeridx=i, - dropout=dp) - hidden <- next.state$h - last.states[[i]] <- next.state - } - # decoder - if (dropout > 0) - hidden <- mx.symbol.Dropout(data=hidden, p=dropout) - last.hidden <- c(last.hidden, hidden) - } - last.hidden$dim <- 0 - last.hidden$num.args <- seq.len - concat <-mxnet:::mx.varg.symbol.Concat(last.hidden) - fc <- mx.symbol.FullyConnected(data=concat, - weight=cls.weight, - bias=cls.bias, - num.hidden=num.label) - - label <- mx.symbol.transpose(data=label) - label <- mx.symbol.Reshape(data=label, target.shape=c(0)) - - loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm") - return (loss.all) -} - -# lstm inference model symbol -lstm.inference.symbol <- function(num.lstm.layer, input.size, - num.hidden, num.embed, num.label, dropout=0.) { - seqidx <- 0 - embed.weight <- mx.symbol.Variable("embed.weight") - cls.weight <- mx.symbol.Variable("cls.weight") - cls.bias <- mx.symbol.Variable("cls.bias") - - param.cells <- lapply(1:num.lstm.layer, function(i) { - cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), - i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), - h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), - h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) - return (cell) - }) - last.states <- lapply(1:num.lstm.layer, function(i) { - state <- list(c=mx.symbol.Variable(paste0("l", i, ".init.c")), - h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - return (state) - }) - - # embeding layer - data <- mx.symbol.Variable("data") - hidden <- mx.symbol.Embedding(data=data, input_dim=input.size, - weight=embed.weight, output_dim=num.embed, name="embed") - - # stack lstm - for (i in 1:num.lstm.layer) { - dp <- ifelse(i==1, 0, dropout) - next.state <- lstm(num.hidden, indata=hidden, - prev.state=last.states[[i]], - param=param.cells[[i]], - seqidx=seqidx, layeridx=i, - dropout=dp) - hidden <- next.state$h - 
last.states[[i]] <- next.state - } - # decoder - if (dropout > 0) - hidden <- mx.symbol.Dropout(data=hidden, p=dropout) - - fc <- mx.symbol.FullyConnected(data=hidden, num_hidden=num.label, - weight=cls.weight, bias=cls.bias, name='pred') - sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') - unpack.c <- lapply(1:num.lstm.layer, function(i) { - state <- last.states[[i]] - state.c <- mx.symbol.BlockGrad(state$c, name=paste0("l", i, ".last.c")) - return (state.c) - }) - unpack.h <- lapply(1:num.lstm.layer, function(i) { - state <- last.states[[i]] - state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")) - return (state.h) - }) - - list.all <- c(sm, unpack.c, unpack.h) - return (mx.symbol.Group(list.all)) -} - - - -#' Training LSTM Unrolled Model -#' -#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array) -#' The Training set. -#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional -#' The validation set used for validation evaluation during the progress. -#' @param num.lstm.layer integer -#' The number of the layer of lstm. -#' @param seq.len integer -#' The length of the input sequence. -#' @param num.hidden integer -#' The number of hidden nodes. -#' @param num.embed integer -#' The output dim of embedding. -#' @param num.label integer -#' The number of labels. -#' @param batch.size integer -#' The batch size used for R array training. -#' @param input.size integer -#' The input dim of one-hot encoding of embedding -#' @param ctx mx.context, optional -#' The device used to perform training. -#' @param num.round integer, default=10 -#' The number of iterations over training data to train the model. -#' @param update.period integer, default=1 -#' The number of iterations to update parameters during training period. -#' @param initializer initializer object. default=mx.init.uniform(0.01) -#' The initialization scheme for parameters. 
-#' @param dropout float, default=0 -#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @param optimizer string, default="sgd" -#' The optimization method. -#' @param epoch.end.callback function, optional -#' The callback when iteration ends. -#' @param batch.end.callback function, optional -#' The callback when one mini-batch iteration ends. -#' @param ... other parameters passing to \code{mx.lstm}/. -#' @return model A trained lstm unrolled model. -#' -#' @export -mx.lstm <- function(train.data, eval.data=NULL, - num.lstm.layer, seq.len, - num.hidden, num.embed, num.label, - batch.size, input.size, - ctx=mx.ctx.default(), - num.round=10, update.period=1, - initializer=mx.init.uniform(0.01), - dropout=0, optimizer='sgd', - epoch.end.callback=NULL, batch.end.callback=NULL, - model, - arg.params, - ...) { - # check data and change data into iterator - train.data <- check.data(train.data, batch.size, TRUE) - eval.data <- check.data(eval.data, batch.size, FALSE) - - - - # get unrolled lstm symbol - if(missing(model)){ - rnn.sym <- lstm.unroll(num.lstm.layer=num.lstm.layer, - num.hidden=num.hidden, - seq.len=seq.len, - input.size=input.size, - num.embed=num.embed, - num.label=num.label, - dropout=dropout) - } else { - rnn.sym=model$symbol - } - - init.states.c <- lapply(1:num.lstm.layer, function(i) { - state.c <- paste0("l", i, ".init.c") - return (state.c) - }) - init.states.h <- lapply(1:num.lstm.layer, function(i) { - state.h <- paste0("l", i, ".init.h") - return (state.h) - }) - init.states.name <- c(init.states.c, init.states.h) - - # set up lstm model - model <- setup.rnn.model(rnn.sym=rnn.sym, - ctx=ctx, - num.rnn.layer=num.lstm.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - batch.size=batch.size, - input.size=input.size, - init.states.name=init.states.name, - initializer=initializer, - dropout=dropout) - # restore states - if (!missing(arg.params)){ - 
arg.names <- names(model$rnn.exec$ref.arg.arrays) - for (k in names(arg.params)) { - if ((k %in% arg.names) && is.param.name(k) ) { - rnn.input <- list() - rnn.input[[k]] <- arg.params[[k]] - mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) - } - } - } - - # train lstm model - model <- train.rnn( model, train.data, eval.data, - num.round=num.round, - update.period=update.period, - ctx=ctx, - init.states.name=init.states.name, - epoch.end.callback=epoch.end.callback, - batch.end.callback=batch.end.callback, - ...) - # change model into MXFeedForwardModel - model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays) - return(structure(model, class="MXFeedForwardModel")) -} - - -#' Create a LSTM Inference Model -#' -#' @param num.lstm.layer integer -#' The number of the layer of lstm. -#' @param input.size integer -#' The input dim of one-hot encoding of embedding -#' @param num.hidden integer -#' The number of hidden nodes. -#' @param num.embed integer -#' The output dim of embedding. -#' @param num.label integer -#' The number of labels. -#' @param batch.size integer, default=1 -#' The batch size used for R array training. -#' @param arg.params list -#' The batch size used for R array training. -#' @param ctx mx.context, optional -#' Model parameter, list of name to NDArray of net's weights. -#' @param dropout float, default=0 -#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) -#' A lstm inference model. -#' -#' @export -mx.lstm.inference <- function(num.lstm.layer, - input.size, - num.hidden, - num.embed, - num.label, - batch.size=1, - arg.params, - ctx=mx.cpu(), - dropout=0.) 
{ - sym <- lstm.inference.symbol(num.lstm.layer=num.lstm.layer, - input.size=input.size, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - dropout=dropout) - - init.states.c <- lapply(1:num.lstm.layer, function(i) { - state.c <- paste0("l", i, ".init.c") - return (state.c) - }) - init.states.h <- lapply(1:num.lstm.layer, function(i) { - state.h <- paste0("l", i, ".init.h") - return (state.h) - }) - init.states.name <- c(init.states.c, init.states.h) - - seq.len <- 1 - # set up lstm model - model <- setup.rnn.model(rnn.sym=sym, - ctx=ctx, - num.rnn.layer=num.lstm.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - batch.size=batch.size, - input.size=input.size, - init.states.name=init.states.name, - initializer=mx.init.uniform(0.01), - dropout=dropout) - arg.names <- names(model$rnn.exec$ref.arg.arrays) - for (k in names(arg.params)) { - if ((k %in% arg.names) && is.param.name(k) ) { - rnn.input <- list() - rnn.input[[k]] <- arg.params[[k]] - mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) - } - } - init.states <- list() - for (i in 1:num.lstm.layer) { - init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - - return (model) -} - -#' Using forward function to predict in lstm inference model -#' -#' @param model lstm model -#' A Lstm inference model -#' @param input.data, array.matrix -#' The input data for forward function -#' @param new.seq boolean, default=FALSE -#' Whether the input is the start of a new sequence -#' -#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. 
-#' -#' @export -mx.lstm.forward <- function(model, input.data, new.seq=FALSE) { - if (new.seq == TRUE) { - init.states <- list() - for (i in 1:model$num.rnn.layer) { - init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.c")]]*0 - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - } - dim(input.data) <- c(model$batch.size) - data <- list(data=mx.nd.array(input.data)) - mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) - mx.exec.forward(model$rnn.exec, is.train=FALSE) - init.states <- list() - for (i in 1:model$num.rnn.layer) { - init.states[[paste0("l", i, ".init.c")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.c_output")]] - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - prob <- model$rnn.exec$ref.outputs[["sm_output"]] - return (list(prob=prob, model=model)) -} diff --git a/R-package/R/model.rnn.R b/R-package/R/model.rnn.R new file mode 100644 index 000000000000..8f3ab8c25874 --- /dev/null +++ b/R-package/R/model.rnn.R @@ -0,0 +1,339 @@ +# Internal function to do multiple device training on RNN +mx.model.train.buckets <- function(symbol, ctx, train.data, eval.data, + dlist, arg.params, aux.params, + grad.req, arg.update.idx, + begin.round, end.round, optimizer, metric, + epoch.end.callback, batch.end.callback, kvstore, verbose = TRUE) { + + ndevice <- length(ctx) + if (verbose) + message(paste0("Start training with ", ndevice, " devices")) + + input.names <- names(dlist) + arg.params.names <- names(arg.params) + + if (is.list(symbol)) sym_ini <- symbol[[names(train.data$bucketID)]] else sym_ini <- symbol + + slices <- lapply(1:ndevice, function(i) { + sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = 
ndevice, axis = 0, squeeze_axis = F)) + }) + + train.execs <- lapply(1:ndevice, function(i) { + s <- slices[[i]] + mx.symbol.bind(symbol = sym_ini, arg.arrays = c(s, arg.params)[arg.update.idx], + aux.arrays = aux.params, ctx = ctx[[i]], grad.req = grad.req) + }) + + # KVStore related stuffs + params.index <- as.integer( + mx.util.filter.null( + lapply(1:length(train.execs[[1]]$ref.grad.arrays), function(k) { + if (!is.null(train.execs[[1]]$ref.grad.arrays[[k]])) k else NULL} + ))) + + update.on.kvstore <- FALSE + if (!is.null(kvstore) && kvstore$update.on.kvstore) { + update.on.kvstore <- TRUE + kvstore$set.optimizer(optimizer) + } else { + updaters <- lapply(1:ndevice, function(i) { + mx.opt.get.updater(optimizer, train.execs[[i]]$ref.arg.arrays) + }) + } + + if (!is.null(kvstore)) { + kvstore$init(params.index, train.execs[[1]]$ref.arg.arrays[params.index]) + } + + # train over specified number of epochs + for (iteration in begin.round:end.round) { + nbatch <- 0 + if (!is.null(metric)) { + train.metric <- metric$init() + } + train.data$reset() + while (train.data$iter.next()) { + + # Get iterator data + dlist <- train.data$value()[input.names] + + # Slice inputs for multi-devices + slices <- lapply(1:ndevice, function(i) { + sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = F)) + }) + + # Assign input to each executor - bug on inference if using BatchNorm + if (is.list(symbol)) { + train.execs <- lapply(1:ndevice, function(i) { + s <- slices[[i]] + mx.symbol.bind(symbol = symbol[[names(train.data$bucketID)]], + arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx], + aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req) + }) + } else { + for (i in 1:ndevice) { + s <- slices[[i]] + mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE) + } + } + + for (texec in train.execs) { + mx.exec.forward(texec, is.train = TRUE) + } + + out.preds <- 
lapply(train.execs, function(texec) { + mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu()) + }) + + for (texec in train.execs) { + mx.exec.backward(texec) + } + + if (!is.null(kvstore)) { + # push the gradient + kvstore$push(params.index, lapply(train.execs, function(texec) { + texec$ref.grad.arrays[params.index] + }), -params.index) + } + if (update.on.kvstore) { + # pull back weight + kvstore$pull(params.index, lapply(train.execs, function(texec) { + texec$ref.arg.arrays[params.index] + }), -params.index) + } else { + # pull back gradient sums + if (!is.null(kvstore)) { + kvstore$pull(params.index, lapply(train.execs, function(texec) { + texec$ref.grad.arrays[params.index] + }), -params.index) + } + arg.blocks <- lapply(1:ndevice, function(i) { + updaters[[i]](train.execs[[i]]$ref.arg.arrays, train.execs[[i]]$ref.grad.arrays) + }) + for (i in 1:ndevice) { + mx.exec.update.arg.arrays(train.execs[[i]], arg.blocks[[i]], skip.null = TRUE) + } + } + + # Update the evaluation metrics + if (!is.null(metric)) { + for (i in 1:ndevice) { + train.metric <- metric$update(label = slices[[i]][[length(slices[[i]])]], + pred = out.preds[[i]], state = train.metric) + } + } + + nbatch <- nbatch + 1 + + if (!is.null(batch.end.callback)) { + batch.end.callback(iteration, nbatch, environment()) + } + } + + if (!is.null(metric)) { + result <- metric$get(train.metric) + if (verbose) + message(paste0("[", iteration, "] Train-", result$name, "=", result$value)) + } + + if (!is.null(eval.data)) { + if (!is.null(metric)) { + eval.metric <- metric$init() + } + eval.data$reset() + while (eval.data$iter.next()) { + + # Get iterator data + dlist <- eval.data$value()[input.names] + + # Slice input to multiple devices + slices <- lapply(1:ndevice, function(i) { + sapply(names(dlist), function(n) mx.nd.split(data=dlist[[n]], num_outputs = ndevice, axis = 0, squeeze_axis = F)) + }) + + # Assign input to each executor - bug on inference if using BatchNorm + if (is.list(symbol)) { + train.execs <- 
lapply(1:ndevice, function(i) { + s <- slices[[i]] + mx.symbol.bind(symbol = symbol[[names(eval.data$bucketID)]], + arg.arrays = c(s, train.execs[[i]]$arg.arrays[arg.params.names])[arg.update.idx], + aux.arrays = train.execs[[i]]$aux.arrays, ctx = ctx[[i]], grad.req = grad.req) + }) + } else { + for (i in 1:ndevice) { + s <- slices[[i]] + mx.exec.update.arg.arrays(train.execs[[i]], s, match.name=TRUE) + } + } + + for (texec in train.execs) { + mx.exec.forward(texec, is.train = FALSE) + } + + # copy outputs to CPU + out.preds <- lapply(train.execs, function(texec) { + mx.nd.copyto(texec$ref.outputs[[1]], mx.cpu()) + }) + + if (!is.null(metric)) { + for (i in 1:ndevice) { + eval.metric <- metric$update(slices[[i]][[length(slices[[i]])]], + out.preds[[i]], eval.metric) + } + } + } + + if (!is.null(metric)) { + result <- metric$get(eval.metric) + if (verbose) { + message(paste0("[", iteration, "] Validation-", result$name, "=", + result$value)) + } + } + } else { + eval.metric <- NULL + } + # get the model out + model <- mx.model.extract.model(sym_ini, train.execs) + + epoch_continue <- TRUE + if (!is.null(epoch.end.callback)) { + epoch_continue <- epoch.end.callback(iteration, 0, environment(), verbose = verbose) + } + + if (!epoch_continue) { + break + } + } + return(model) +} + + +# +#' Train RNN with bucket support +#' +#' @param symbol Symbol or list of Symbols representing the model +#' @param train.data Training data created by mx.io.bucket.iter +#' @param eval.data Evaluation data created by mx.io.bucket.iter +#' @param num.round int, number of epoch +#' @param initializer +#' @param optimizer +#' @param batch.end.callback +#' @param epoch.end.callback +#' @param begin.round +#' @param metric +#' @param ctx +#' @param kvstore +#' @param verbose +#' +#' @export +mx.model.buckets <- function(symbol, train.data, eval.data = NULL, metric = NULL, + arg.params = NULL, aux.params = NULL, fixed.params = NULL, + num.round = 1, begin.round = 1, + initializer = 
mx.init.uniform(0.01), optimizer = "sgd", ctx = NULL, + batch.end.callback = NULL, epoch.end.callback = NULL, + kvstore = "local", verbose = TRUE) { + + if (!train.data$iter.next()) { + train.data$reset() + if (!train.data$iter.next()) + stop("Empty train.data") + } + + if (!is.null(eval.data)) { + if (!eval.data$iter.next()) { + eval.data$reset() + if (!eval.data$iter.next()) + stop("Empty eval.data") + } + } + + if (is.null(ctx)) + ctx <- mx.ctx.default() + if (is.mx.context(ctx)) { + ctx <- list(ctx) + } + if (!is.list(ctx)) + stop("ctx must be mx.context or list of mx.context") + if (is.character(optimizer)) { + if (is.numeric(input.shape)) { + ndim <- length(input.shape) + batchsize <- input.shape[[ndim]] + } else { + ndim <- length(input.shape[[1]]) + batchsize <- input.shape[[1]][[ndim]] + } + optimizer <- mx.opt.create(optimizer, rescale.grad = (1/batchsize), ...) + } + + if (is.list(symbol)) sym_ini <- symbol[[names(train.data$bucketID)]] else sym_ini <- symbol + + arguments <- sym_ini$arguments + input.names <- intersect(names(train.data$value()), arguments) + + input.shape <- sapply(input.names, function(n) { + dim(train.data$value()[[n]]) + }, simplify = FALSE) + + shapes <- sym_ini$infer.shape(input.shape) + + # assign arg.params and aux.params arguments to arg.params.input and aux.params.input + arg.params.input <- arg.params + aux.params.input <- aux.params + + # initialize all arguments with zeros + arg.params <- lapply(shapes$arg.shapes, function(shape) { + mx.nd.zeros(shape = shape, ctx = mx.cpu()) + }) + + # initialize input parameters + dlist <- arg.params[input.names] + + # initialize parameters - only argument ending with _weight and _bias are initialized + arg.params.ini <- mx.init.create(initializer = initializer, shape.array = shapes$arg.shapes, ctx = mx.cpu(), skip.unknown = TRUE) + + # assign initilized parameters to arg.params + arg.params[names(arg.params.ini)] <- arg.params.ini + + # assign input params to arg.params + 
arg.params[names(arg.params.input)] <- arg.params.input + + # remove input params from arg.params + arg.params[input.names] <- NULL + + # Grad request + grad.req <- rep("null", length(arguments)) + grad.req.write <- arguments %in% setdiff(names(arg.params.ini), fixed.params) + grad.req[grad.req.write] <- "write" + + # Arg array order + update_names <- c(input.names, names(arg.params)) + arg.update.idx <- match(arguments, update_names) + + # aux parameters setup + aux.params <- lapply(shapes$aux.shapes, function(shape) { + mx.nd.zeros(shape = shape, ctx = mx.cpu()) + }) + + aux.params.ini <- mx.init.create(initializer, shapes$aux.shapes, ctx = mx.cpu(), skip.unknown = FALSE) + if (length(aux.params) > 0) { + aux.params[names(aux.params.ini)] <- aux.params.ini + } else aux.params <- NULL + + aux.params[names(aux.params.input)] <- aux.params.input + + # kvstore initialization + kvstore <- mx.model.create.kvstore(kvstore, params$arg.params, length(ctx), + verbose = verbose) + + ### Execute training + model <- mx.model.train.buckets(symbol = symbol, ctx = ctx, train.data = train.data, eval.data = eval.data, + dlist = dlist, arg.params = arg.params, aux.params = aux.params, + grad.req = grad.req, arg.update.idx = arg.update.idx, + optimizer = optimizer, metric = metric, + begin.round = begin.round, end.round = num.round, + batch.end.callback = batch.end.callback, epoch.end.callback = epoch.end.callback, + kvstore = kvstore, verbose = verbose) + + return(model) +} diff --git a/R-package/R/mx.io.bucket.iter.R b/R-package/R/mx.io.bucket.iter.R new file mode 100644 index 000000000000..8e5ab59eaab8 --- /dev/null +++ b/R-package/R/mx.io.bucket.iter.R @@ -0,0 +1,110 @@ + +BucketIter <- setRefClass("BucketIter", fields = c("buckets", "bucket.names", "batch.size", + "data.mask.element", "shuffle", "bucket.plan", "bucketID", "epoch", "batch", "batch.per.bucket", + "last.batch.pad", "batch.per.epoch", "seed"), + methods = list( + initialize = function(buckets, + batch.size, 
data.mask.element = 0, shuffle = FALSE, seed = 123) { + .self$buckets <- buckets + .self$bucket.names <- names(.self$buckets) + .self$batch.size <- batch.size + .self$data.mask.element <- data.mask.element + .self$epoch <- 0 + .self$batch <- 0 + .self$shuffle <- shuffle + .self$batch.per.bucket <- 0 + .self$batch.per.epoch <- 0 + .self$bucket.plan <- NULL + .self$bucketID <- NULL + .self$seed <- seed + .self + }, reset = function() { + buckets_nb <- length(bucket.names) + buckets_id <- 1:buckets_nb + buckets.size <- sapply(.self$buckets, function(x) { + dim(x$data)[length(dim(x$data)) - 1] + }) + .self$batch.per.bucket <- ceiling(buckets.size/.self$batch.size) + .self$last.batch.pad <- .self$batch.size - buckets.size %% .self$batch.size + .self$last.batch.pad[.self$last.batch.pad == .self$batch.size] <- 0 + + .self$batch.per.epoch <- sum(.self$batch.per.bucket) + # Number of batches per epoch given the batch.size + .self$batch.per.epoch <- sum(.self$batch.per.bucket) + .self$epoch <- .self$epoch + 1 + .self$batch <- 0 + + if (.self$shuffle) { + set.seed(.self$seed) + bucket_plan_names <- sample(rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket)) + .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, + FUN = cumsum) + names(.self$bucket.plan) <- bucket_plan_names + ### Return first BucketID at reset for initialization of the model + .self$bucketID <- .self$bucket.plan[1] + + .self$buckets <- lapply(.self$buckets, function(x) { + shuffle_id <- sample(dim(x$data)[length(dim(x$data)) - 1]) + if (length(dim(x$label)) == 0) { + list(data = x$data[shuffle_id, ], label = x$label[shuffle_id]) + } else { + list(data = x$data[shuffle_id, ], label = x$label[shuffle_id, ]) + } + }) + } else { + bucket_plan_names <- rep(names(.self$batch.per.bucket), times = .self$batch.per.bucket) + .self$bucket.plan <- ave(bucket_plan_names == bucket_plan_names, bucket_plan_names, + FUN = cumsum) + names(.self$bucket.plan) <- bucket_plan_names + } 
+ }, iter.next = function() { + .self$batch <- .self$batch + 1 + .self$bucketID <- .self$bucket.plan[batch] + if (.self$batch > .self$batch.per.epoch) { + return(FALSE) + } else { + return(TRUE) + } + }, value = function() { + # bucketID is a named integer: the integer indicates the batch id for the given + # bucket (used to fetch appropriate samples within the bucket) the name is the a + # character containing the sequence length of the bucket (used to unroll the rnn + # to appropriate sequence length) + idx <- (.self$bucketID - 1) * (.self$batch.size) + (1:batch.size) + + ### reuse first idx for padding + if (bucketID == .self$batch.per.bucket[names(.self$bucketID)] & !.self$last.batch.pad[names(.self$bucketID)] == 0) { + idx <- c(idx[1:(.self$batch.size - .self$last.batch.pad[names(.self$bucketID)])], 1:(.self$last.batch.pad[names(.self$bucketID)])) + } + + data <- .self$buckets[[names(.self$bucketID)]]$data[idx, , drop = F] + seq.mask <- as.integer(names(bucketID)) - apply(data==.self$data.mask.element, 1, sum) + if (length(dim(.self$buckets[[names(.self$bucketID)]]$label)) == 0) { + label <- .self$buckets[[names(.self$bucketID)]]$label[idx] + } else { + label <- .self$buckets[[names(.self$bucketID)]]$label[idx, , drop = F] + } + return(list(data = mx.nd.array(data), seq.mask = mx.nd.array(seq.mask), + label = mx.nd.array(label))) + }, num.pad = function() { + if (bucketID == .self$batch.per.bucket[names(.self$bucketID)] & !.self$last.batch.pad[names(.self$bucketID)] == 0){ + return(.self$last.batch.pad[names(.self$bucketID)]) + } else return(0) + }, finalize = function() { + })) + +# +#' Create Bucket Iter +#' +#' @param buckets The data array. +#' @param batch.size The batch size used to pack the array. 
+#' @param data.mask.element The element to mask +#' @param shuffle Whether shuffle the data +#' @param seed The random seed +#' +#' @export +mx.io.bucket.iter <- function(buckets, batch.size, data.mask.element = 0, shuffle = FALSE, + seed = 123) { + return(BucketIter$new(buckets = buckets, batch.size = batch.size, data.mask.element = data.mask.element, + shuffle = shuffle, seed = seed)) +} diff --git a/R-package/R/rnn.R b/R-package/R/rnn.R deleted file mode 100644 index b89559a58570..000000000000 --- a/R-package/R/rnn.R +++ /dev/null @@ -1,342 +0,0 @@ -# rnn cell symbol -rnn <- function(num.hidden, indata, prev.state, param, seqidx, - layeridx, dropout=0., batch.norm=FALSE) { - if (dropout > 0. ) - indata <- mx.symbol.Dropout(data=indata, p=dropout) - i2h <- mx.symbol.FullyConnected(data=indata, - weight=param$i2h.weight, - bias=param$i2h.bias, - num.hidden=num.hidden, - name=paste0("t", seqidx, ".l", layeridx, ".i2h")) - h2h <- mx.symbol.FullyConnected(data=prev.state$h, - weight=param$h2h.weight, - bias=param$h2h.bias, - num.hidden=num.hidden, - name=paste0("t", seqidx, ".l", layeridx, ".h2h")) - hidden <- i2h + h2h - - hidden <- mx.symbol.Activation(data=hidden, act.type="tanh") - if (batch.norm) - hidden <- mx.symbol.BatchNorm(data=hidden) - return (list(h=hidden)) -} - -# unrolled rnn network -rnn.unroll <- function(num.rnn.layer, seq.len, input.size, num.hidden, - num.embed, num.label, dropout=0., batch.norm=FALSE) { - embed.weight <- mx.symbol.Variable("embed.weight") - cls.weight <- mx.symbol.Variable("cls.weight") - cls.bias <- mx.symbol.Variable("cls.bias") - param.cells <- lapply(1:num.rnn.layer, function(i) { - cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), - i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), - h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), - h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) - return (cell) - }) - last.states <- lapply(1:num.rnn.layer, function(i) { - 
state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - return (state) - }) - - # embeding layer - label <- mx.symbol.Variable("label") - data <- mx.symbol.Variable("data") - embed <- mx.symbol.Embedding(data=data, input_dim=input.size, - weight=embed.weight, output_dim=num.embed, name="embed") - wordvec <- mx.symbol.SliceChannel(data=embed, num_outputs=seq.len, squeeze_axis=1) - - last.hidden <- list() - for (seqidx in 1:seq.len) { - hidden <- wordvec[[seqidx]] - # stack RNN - for (i in 1:num.rnn.layer) { - dp <- ifelse(i==1, 0, dropout) - next.state <- rnn(num.hidden, indata=hidden, - prev.state=last.states[[i]], - param=param.cells[[i]], - seqidx=seqidx, layeridx=i, - dropout=dp, batch.norm=batch.norm) - hidden <- next.state$h - last.states[[i]] <- next.state - } - # decoder - if (dropout > 0.) - hidden <- mx.symbol.Dropout(data=hidden, p=dropout) - last.hidden <- c(last.hidden, hidden) - } - last.hidden$dim <- 0 - last.hidden$num.args <- seq.len - concat <-mxnet:::mx.varg.symbol.Concat(last.hidden) - fc <- mx.symbol.FullyConnected(data=concat, - weight=cls.weight, - bias=cls.bias, - num.hidden=num.label) - label <- mx.symbol.transpose(data=label) - label <- mx.symbol.Reshape(data=label, target.shape=c(0)) - - loss.all <- mx.symbol.SoftmaxOutput(data=fc, label=label, name="sm") - return (loss.all) -} - -# rnn inference model symbol -rnn.inference.symbol <- function(num.rnn.layer, seq.len, input.size, num.hidden, - num.embed, num.label, dropout=0., batch.norm=FALSE) { - seqidx <- 0 - embed.weight <- mx.symbol.Variable("embed.weight") - cls.weight <- mx.symbol.Variable("cls.weight") - cls.bias <- mx.symbol.Variable("cls.bias") - param.cells <- lapply(1:num.rnn.layer, function(i) { - cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), - i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), - h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), - h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) - return 
(cell) - }) - last.states <- lapply(1:num.rnn.layer, function(i) { - state <- list(h=mx.symbol.Variable(paste0("l", i, ".init.h"))) - return (state) - }) - - # embeding layer - data <- mx.symbol.Variable("data") - hidden <- mx.symbol.Embedding(data=data, input_dim=input.size, - weight=embed.weight, output_dim=num.embed, name="embed") - # stack RNN - for (i in 1:num.rnn.layer) { - dp <- ifelse(i==1, 0, dropout) - next.state <- rnn(num.hidden, indata=hidden, - prev.state=last.states[[i]], - param=param.cells[[i]], - seqidx=seqidx, layeridx=i, - dropout=dp, batch.norm=batch.norm) - hidden <- next.state$h - last.states[[i]] <- next.state - } - # decoder - if (dropout > 0.) - hidden <- mx.symbol.Dropout(data=hidden, p=dropout) - - fc <- mx.symbol.FullyConnected(data=hidden, - weight=cls.weight, - bias=cls.bias, - num_hidden=num.label) - sm <- mx.symbol.SoftmaxOutput(data=fc, name='sm') - unpack.h <- lapply(1:num.rnn.layer, function(i) { - state <- last.states[[i]] - state.h <- mx.symbol.BlockGrad(state$h, name=paste0("l", i, ".last.h")) - return (state.h) - }) - list.all <- c(sm, unpack.h) - return (mx.symbol.Group(list.all)) -} - -#' Training RNN Unrolled Model -#' -#' @param train.data mx.io.DataIter or list(data=R.array, label=R.array) -#' The Training set. -#' @param eval.data mx.io.DataIter or list(data=R.array, label=R.array), optional -#' The validation set used for validation evaluation during the progress. -#' @param num.rnn.layer integer -#' The number of the layer of rnn. -#' @param seq.len integer -#' The length of the input sequence. -#' @param num.hidden integer -#' The number of hidden nodes. -#' @param num.embed integer -#' The output dim of embedding. -#' @param num.label integer -#' The number of labels. -#' @param batch.size integer -#' The batch size used for R array training. -#' @param input.size integer -#' The input dim of one-hot encoding of embedding -#' @param ctx mx.context, optional -#' The device used to perform training. 
-#' @param num.round integer, default=10 -#' The number of iterations over training data to train the model. -#' @param update.period integer, default=1 -#' The number of iterations to update parameters during training period. -#' @param initializer initializer object. default=mx.init.uniform(0.01) -#' The initialization scheme for parameters. -#' @param dropout float, default=0 -#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @param optimizer string, default="sgd" -#' The optimization method. -#' @param batch.norm boolean, default=FALSE -#' Whether to use batch normalization. -#' @param ... other parameters passing to \code{mx.rnn}/. -#' @return model A trained rnn unrolled model. -#' -#' @export -mx.rnn <- function( train.data, eval.data=NULL, - num.rnn.layer, seq.len, - num.hidden, num.embed, num.label, - batch.size, input.size, - ctx=mx.ctx.default(), - num.round=10, update.period=1, - initializer=mx.init.uniform(0.01), - dropout=0, optimizer='sgd', - batch.norm=FALSE, - ...) 
{ - # check data and change data into iterator - train.data <- check.data(train.data, batch.size, TRUE) - eval.data <- check.data(eval.data, batch.size, FALSE) - - # get unrolled rnn symbol - rnn.sym <- rnn.unroll( num.rnn.layer=num.rnn.layer, - num.hidden=num.hidden, - seq.len=seq.len, - input.size=input.size, - num.embed=num.embed, - num.label=num.label, - dropout=dropout, - batch.norm=batch.norm) - init.states.name <- lapply(1:num.rnn.layer, function(i) { - state <- paste0("l", i, ".init.h") - return (state) - }) - # set up rnn model - model <- setup.rnn.model(rnn.sym=rnn.sym, - ctx=ctx, - num.rnn.layer=num.rnn.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - batch.size=batch.size, - input.size=input.size, - init.states.name=init.states.name, - initializer=initializer, - dropout=dropout) - # train rnn model - model <- train.rnn( model, train.data, eval.data, - num.round=num.round, - update.period=update.period, - ctx=ctx, - init.states.name=init.states.name, - ...) - # change model into MXFeedForwardModel - model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, aux.params=model$rnn.exec$ref.aux.arrays) - return(structure(model, class="MXFeedForwardModel")) -} - -#' Create a RNN Inference Model -#' -#' @param num.rnn.layer integer -#' The number of the layer of rnn. -#' @param input.size integer -#' The input dim of one-hot encoding of embedding -#' @param num.hidden integer -#' The number of hidden nodes. -#' @param num.embed integer -#' The output dim of embedding. -#' @param num.label integer -#' The number of labels. -#' @param batch.size integer, default=1 -#' The batch size used for R array training. -#' @param arg.params list -#' The batch size used for R array training. -#' @param ctx mx.context, optional -#' Model parameter, list of name to NDArray of net's weights. 
-#' @param dropout float, default=0 -#' A number in [0,1) containing the dropout ratio from the last hidden layer to the output layer. -#' @param batch.norm boolean, default=FALSE -#' Whether to use batch normalization. -#' @return model list(rnn.exec=integer, symbol=mxnet symbol, num.rnn.layer=integer, num.hidden=integer, seq.len=integer, batch.size=integer, num.embed=integer) -#' A rnn inference model. -#' -#' @export -mx.rnn.inference <- function( num.rnn.layer, - input.size, - num.hidden, - num.embed, - num.label, - batch.size=1, - arg.params, - ctx=mx.cpu(), - dropout=0., - batch.norm=FALSE) { - sym <- rnn.inference.symbol( num.rnn.layer=num.rnn.layer, - input.size=input.size, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - dropout=dropout, - batch.norm=batch.norm) - # init.states.name <- c() - # for (i in 1:num.rnn.layer) { - # init.states.name <- c(init.states.name, paste0("l", i, ".init.c")) - # init.states.name <- c(init.states.name, paste0("l", i, ".init.h")) - # } - init.states.name <- lapply(1:num.rnn.layer, function(i) { - state <- paste0("l", i, ".init.h") - return (state) - }) - - seq.len <- 1 - # set up rnn model - model <- setup.rnn.model(rnn.sym=sym, - ctx=ctx, - num.rnn.layer=num.rnn.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=num.label, - batch.size=batch.size, - input.size=input.size, - init.states.name=init.states.name, - initializer=mx.init.uniform(0.01), - dropout=dropout) - arg.names <- names(model$rnn.exec$ref.arg.arrays) - for (k in names(arg.params)) { - if ((k %in% arg.names) && is.param.name(k) ) { - rnn.input <- list() - rnn.input[[k]] <- arg.params[[k]] - mx.exec.update.arg.arrays(model$rnn.exec, rnn.input, match.name=TRUE) - } - } - init.states <- list() - for (i in 1:num.rnn.layer) { - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) 
- - return (model) -} - -#' Using forward function to predict in rnn inference model -#' -#' @param model rnn model -#' A rnn inference model -#' @param input.data, array.matrix -#' The input data for forward function -#' @param new.seq boolean, default=FALSE -#' Whether the input is the start of a new sequence -#' -#' @return result A list(prob=prob, model=model) containing the result probability of each label and the model. -#' -#' @export -mx.rnn.forward <- function(model, input.data, new.seq=FALSE) { - if (new.seq == TRUE) { - init.states <- list() - for (i in 1:model$num.rnn.layer) { - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.arg.arrays[[paste0("l", i, ".init.h")]]*0 - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - } - dim(input.data) <- c(model$batch.size) - data <- list(data=mx.nd.array(input.data)) - mx.exec.update.arg.arrays(model$rnn.exec, data, match.name=TRUE) - mx.exec.forward(model$rnn.exec, is.train=FALSE) - init.states <- list() - for (i in 1:model$num.rnn.layer) { - init.states[[paste0("l", i, ".init.h")]] <- model$rnn.exec$ref.outputs[[paste0("l", i, ".last.h_output")]] - } - mx.exec.update.arg.arrays(model$rnn.exec, init.states, match.name=TRUE) - #print (model$rnn.exec$ref) - prob <- model$rnn.exec$ref.outputs[["sm_output"]] - print ("prob") - print (prob) - return (list(prob=prob, model=model)) -} diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R new file mode 100644 index 000000000000..2c099f08028a --- /dev/null +++ b/R-package/R/rnn.graph.R @@ -0,0 +1,329 @@ +# +#' Generate a RNN symbolic model - requires CUDA +#' +#' @param config Either seq-to-one or one-to-one +#' @param cell.type Type of RNN cell: either gru or lstm +#' @param num.rnn.layer int, number of stacked layers +#' @param num.hidden int, size of the state in each RNN layer +#' @param num.embed int, default = NULL - no embedding. 
Dimension of the embedding vectors +#' @param num.decode int, number of output variables in the decoding layer +#' @param input.size int, number of levels in the data - only used for embedding +#' @param dropout +#' +#' @export +rnn.graph <- function(num.rnn.layer, + input.size = NULL, + num.embed = NULL, + num.hidden, + num.decode, + dropout = 0, + ignore_label = -1, + loss_output = NULL, + config, + cell.type, + masking = F, + output_last_state = F) { + + # define input arguments + data <- mx.symbol.Variable("data") + label <- mx.symbol.Variable("label") + seq.mask <- mx.symbol.Variable("seq.mask") + + if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight") + + rnn.params.weight <- mx.symbol.Variable("rnn.params.weight") + rnn.state <- mx.symbol.Variable("rnn.state") + + if (cell.type == "lstm") { + rnn.state.cell <- mx.symbol.Variable("rnn.state.cell") + } + + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + + if (!is.null(num.embed)){ + data <- mx.symbol.Embedding(data=data, input_dim=input.size, + weight=embed.weight, output_dim=num.embed, name="embed") + } + + # RNN cells + if (cell.type == "lstm") { + rnn <- mx.symbol.RNN(data=data, state=rnn.state, state_cell = rnn.state.cell, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_")) + + } else { + rnn <- mx.symbol.RNN(data=data, state=rnn.state, parameters=rnn.params.weight, state.size=num.hidden, num.layers=num.rnn.layer, bidirectional=F, mode=cell.type, state.outputs=output_last_state, p=dropout, name=paste(cell.type, num.rnn.layer, "layer", sep="_")) + } + + # Decode + if (config=="seq-to-one") { + + if (masking) mask <- mx.symbol.SequenceLast(data=rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else + mask <- mx.symbol.SequenceLast(data=rnn[[1]], 
 use.sequence.length = F, name = "mask") + + decode <- mx.symbol.FullyConnected(data=mask, + weight=cls.weight, + bias=cls.bias, + num.hidden=num.decode, + name = "decode") + + if (!is.null(loss_output)) { + loss <- switch(loss_output, + softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), + linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"), + logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"), + MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss") + ) + } else loss <- decode + + } else if (config=="one-to-one"){ + + if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else + mask <- mx.symbol.identity(data = rnn[[1]], name = "mask") + + mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1)) + + decode <- mx.symbol.FullyConnected(data=mask, + weight=cls.weight, + bias=cls.bias, + num.hidden=num.decode, + name = "decode") + + label <- mx.symbol.reshape(data=label, shape=c(-1), name = "label_reshape") + + if (!is.null(loss_output)) { + loss <- switch(loss_output, + softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), + linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"), + logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"), + MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss") + ) + } else loss <- decode + } + return(loss) +} + + +# LSTM cell symbol +lstm.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0) { + i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias, + num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".i2h")) + + if
(dropout > 0) + i2h <- mx.symbol.Dropout(data = i2h, p = dropout) + + if (!is.null(prev.state)) { + h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight, + bias = param$h2h.bias, num.hidden = num.hidden * 4, + name = paste0("t", seqidx, ".l", layeridx, ".h2h")) + gates <- i2h + h2h + } else { + gates <- i2h + } + + split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F, + name = paste0("t", seqidx, ".l", layeridx, ".slice")) + + in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid") + in.transform <- mx.symbol.Activation(split.gates[[2]], act.type = "tanh") + forget.gate <- mx.symbol.Activation(split.gates[[3]], act.type = "sigmoid") + out.gate <- mx.symbol.Activation(split.gates[[4]], act.type = "sigmoid") + + if (is.null(prev.state)) { + next.c <- in.gate * in.transform + } else { + next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform) + } + + next.h <- out.gate * mx.symbol.Activation(next.c, act.type = "tanh") + + return(list(c = next.c, h = next.h)) +} + +# GRU cell symbol +gru.cell <- function(num.hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0) { + i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight, + bias = param$gates.i2h.bias, num.hidden = num.hidden * 2, + name = paste0("t", seqidx, ".l", layeridx, ".gates.i2h")) + + if (dropout > 0) + i2h <- mx.symbol.Dropout(data = i2h, p = dropout) + + if (!is.null(prev.state)) { + h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight, + bias = param$gates.h2h.bias, num.hidden = num.hidden * 2, + name = paste0("t", seqidx, ".l", layeridx, ".gates.h2h")) + gates <- i2h + h2h + } else { + gates <- i2h + } + + split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F, + name = paste0("t", seqidx, ".l", layeridx, ".split")) + + update.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid") + reset.gate <- 
mx.symbol.Activation(split.gates[[2]], act.type = "sigmoid") + + htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight, + bias = param$trans.i2h.bias, num.hidden = num.hidden, + name = paste0("t", seqidx, ".l", layeridx, ".trans.i2h")) + + if (is.null(prev.state)) { + h.after.reset <- reset.gate * 0 + } else { + h.after.reset <- prev.state$h * reset.gate + } + + htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, + bias = param$trans.h2h.bias, num.hidden = num.hidden, + name = paste0("t", seqidx, ".l", layeridx, ".trans.h2h")) + + h.trans <- htrans.i2h + htrans.h2h + h.trans.active <- mx.symbol.Activation(h.trans, act.type = "tanh") + + if (is.null(prev.state)) { + next.h <- update.gate * h.trans.active + } else { + next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h) + } + + return(list(h = next.h)) +} + +# +#' unroll representation of RNN running on non CUDA device - under development +#' +#' @export +rnn.graph.unroll <- function(num.rnn.layer, + seq.len, + input.size = NULL, + num.embed = NULL, + num.hidden, + num.decode, + dropout = 0, + ignore_label = -1, + loss_output = NULL, + init.state = NULL, + config, + cell.type = "lstm", + masking = F, + output_last_state = F) { + + + if (!is.null(num.embed)) embed.weight <- mx.symbol.Variable("embed.weight") + + cls.weight <- mx.symbol.Variable("cls.weight") + cls.bias <- mx.symbol.Variable("cls.bias") + + param.cells <- lapply(1:num.rnn.layer, function(i) { + + if (cell.type=="lstm"){ + cell <- list(i2h.weight = mx.symbol.Variable(paste0("l", i, ".i2h.weight")), + i2h.bias = mx.symbol.Variable(paste0("l", i, ".i2h.bias")), + h2h.weight = mx.symbol.Variable(paste0("l", i, ".h2h.weight")), + h2h.bias = mx.symbol.Variable(paste0("l", i, ".h2h.bias"))) + } else if (cell.type=="gru"){ + cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.i2h.weight")), + gates.i2h.bias = mx.symbol.Variable(paste0("l", i, 
".gates.i2h.bias")), + gates.h2h.weight = mx.symbol.Variable(paste0("l", i, ".gates.h2h.weight")), + gates.h2h.bias = mx.symbol.Variable(paste0("l", i, ".gates.h2h.bias")), + trans.i2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.i2h.weight")), + trans.i2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.i2h.bias")), + trans.h2h.weight = mx.symbol.Variable(paste0("l", i, ".trans.h2h.weight")), + trans.h2h.bias = mx.symbol.Variable(paste0("l", i, ".trans.h2h.bias"))) + } + return (cell) + }) + + # embeding layer + data <- mx.symbol.Variable("data") + label <- mx.symbol.Variable("label") + seq.mask <- mx.symbol.Variable("seq.mask") + + if (!is.null(num.embed)) { + data <- mx.symbol.Embedding(data = data, input_dim = input.size, + weight=embed.weight, output_dim = num.embed, name = "embed") + } + + data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq.len, squeeze_axis = T) + + last.hidden <- list() + last.states <- list() + + for (seqidx in 1:seq.len) { + hidden <- data[[seqidx]] + + for (i in 1:num.rnn.layer) { + + if (seqidx==1) prev.state<- init.state[[i]] else prev.state <- last.states[[i]] + + if (cell.type=="lstm") { + cell.symbol <- lstm.cell + } else if (cell.type=="gru"){ + cell.symbol <- gru.cell + } + + next.state <- cell.symbol(num.hidden = num.hidden, + indata = hidden, + prev.state = prev.state, + param = param.cells[[i]], + seqidx = seqidx, + layeridx = i, + dropout = dropout) + hidden <- next.state$h + last.states[[i]] <- next.state + } + + # Aggregate outputs from each timestep + last.hidden <- c(last.hidden, hidden) + } + + # concat hidden units - concat seq.len blocks of dimension num.hidden x batch.size + concat <- mx.symbol.concat(data = last.hidden, num.args = seq.len, dim = 0, name = "concat") + concat <- mx.symbol.reshape(data = concat, shape = c(num.hidden, -1, seq.len), name = "rnn_reshape") + + if (config=="seq-to-one"){ + + if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = 
seq.mask, name = "mask") else + mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = "mask") + + decode <- mx.symbol.FullyConnected(data = mask, + weight = cls.weight, + bias = cls.bias, + num.hidden = num.decode, + name = "decode") + + if (!is.null(loss_output)) { + loss <- switch(loss_output, + softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), + linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"), + logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"), + MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss") + ) + } else loss <- decode + + } else if (config=="one-to-one"){ + + if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else + mask <- mx.symbol.identity(data = concat, name = "mask") + + mask = mx.symbol.reshape(mask, shape=c(num.hidden, -1)) + + decode <- mx.symbol.FullyConnected(data = mask, + weight = cls.weight, + bias = cls.bias, + num.hidden = num.decode, + name = "decode") + + label <- mx.symbol.reshape(data = label, shape = -1, name = "label_reshape") + + if (!is.null(loss_output)) { + loss <- switch(loss_output, + softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), + linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = "loss"), + logictic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = "loss"), + MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = "loss") + ) + } else loss <- decode + } + return(loss) +} diff --git a/R-package/R/rnn.infer.R b/R-package/R/rnn.infer.R new file mode 100644 index 000000000000..c9ccecbddbeb --- /dev/null +++ b/R-package/R/rnn.infer.R @@ -0,0 +1,177 @@ +# +#' Inference of RNN model +#' +#' 
 @param infer.data Data iterator created by mx.io.bucket.iter +#' @param model Model used for inference +#' @param ctx Context on which to run inference, e.g. mx.cpu() +#' +#' @export +mx.infer.buckets <- function(infer.data, model, ctx = mx.cpu()) { + + ### Initialise the iterator + infer.data$reset() + infer.data$iter.next() + + if (is.null(ctx)) + ctx <- mx.ctx.default() + if (is.mx.context(ctx)) { + ctx <- list(ctx) + } + if (!is.list(ctx)) + stop("ctx must be mx.context or list of mx.context") + + ndevice <- length(ctx) + symbol <- model$symbol + if (is.list(symbol)) sym_ini <- symbol[[names(infer.data$bucketID)]] else sym_ini <- symbol + + arguments <- sym_ini$arguments + input.names <- intersect(names(infer.data$value()), arguments) + + input.shape <- sapply(input.names, function(n) { + dim(infer.data$value()[[n]]) + }, simplify = FALSE) + + shapes <- sym_ini$infer.shape(input.shape) + + # initialize all arguments with zeros + arguments.ini <- lapply(shapes$arg.shapes, function(shape) { + mx.nd.zeros(shape = shape, ctx = mx.cpu()) + }) + + arg.params <- model$arg.params + arg.params.names <- names(arg.params) + aux.params <- model$aux.params + + # Initial binding + dlist <- arguments.ini[input.names] + + # Assign fixed parameters to their value and keep non initialized arguments to zero + arg.params.fix.names <- setdiff(arguments, c(arg.params.names, input.names)) + + # Assign zeros to non initialized arg parameters + arg.params.fix <- arguments.ini[arg.params.fix.names] + + # Grad request + grad.req <- rep("null", length(arguments)) + + # Arg array order + update_names <- c(input.names, arg.params.fix.names, arg.params.names) + arg_update_idx <- match(arguments, update_names) + + execs <- mx.symbol.bind(symbol = symbol, arg.arrays = c(dlist, arg.params.fix, arg.params)[arg_update_idx], + aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req) + + # Initial input shapes - need to be adapted for multi-devices - divide highest + # dimension by device nb + + packer <-
mx.nd.arraypacker() + infer.data$reset() + while (infer.data$iter.next()) { + + # Get input data slice + dlist <- infer.data$value() #[input.names] + + execs <- mx.symbol.bind(symbol = symbol, arg.arrays = c(dlist, execs$arg.arrays[arg.params.fix.names], execs$arg.arrays[arg.params.names])[arg_update_idx], + aux.arrays = execs$aux.arrays, ctx = ctx[[1]], grad.req = grad.req) + + mx.exec.forward(execs, is.train = FALSE) + + out.pred <- mx.nd.copyto(execs$ref.outputs[[1]], mx.cpu()) + padded <- infer.data$num.pad() + oshape <- dim(out.pred) + ndim <- length(oshape) + packer$push(mx.nd.slice.axis(data = out.pred, axis = 0, begin = 0, end = oshape[[ndim]] - padded)) + + } + infer.data$reset() + return(packer$get()) +} + + + +### inference for one-to-one models +mx.infer.buckets.one <- function(infer.data, + symbol, arg.params, aux.params, input.params = NULL, + ctx = mx.cpu()) { + + ### Initialise the iterator + infer.data$reset() + infer.data$iter.next() + + if (is.null(ctx)) + ctx <- mx.ctx.default() + if (is.mx.context(ctx)) { + ctx <- list(ctx) + } + if (!is.list(ctx)) + stop("ctx must be mx.context or list of mx.context") + + ndevice <- length(ctx) + + arguments <- symbol$arguments + input.names <- intersect(names(infer.data$value()), arguments) + + input.shape <- sapply(input.names, function(n) { + dim(infer.data$value()[[n]]) + }, simplify = FALSE) + + shapes <- symbol$infer.shape(input.shape) + + # initialize all arguments with zeros + arguments.ini <- lapply(shapes$arg.shapes, function(shape) { + mx.nd.zeros(shape = shape, ctx = mx.cpu()) + }) + + arg.params <- arg.params + arg.params.names <- names(arg.params) + + dlist <- arguments.ini[input.names] + + # Assign fixed parameters to their value and keep non initialized arguments to zero + arg.params.fix.names <- unique(c(names(input.params), setdiff(arguments, c(arg.params.names, input.names)))) + + # Assign zeros to non initialized arg parameters + arg.params.fix <- arguments.ini[arg.params.fix.names] + # 
Assign weights to arguments specifies by input.params + arg.params.fix[names(input.params)] <- input.params + + aux.params <- aux.params + + # Grad request + grad.req <- rep("null", length(arguments)) + + # Arg array order + update_names <- c(input.names, arg.params.fix.names, arg.params.names) + arg_update_idx <- match(arguments, update_names) + + # Initial binding + execs <- mx.symbol.bind(symbol = symbol, + arg.arrays = c(dlist, arg.params.fix, arg.params)[arg_update_idx], + aux.arrays = aux.params, ctx = ctx[[1]], grad.req = grad.req) + + # Initial input shapes - need to be adapted for multi-devices - divide highest + # dimension by device nb + + infer.data$reset() + while (infer.data$iter.next()) { + + # Get input data slice + dlist <- infer.data$value()[input.names] + + execs <- mx.symbol.bind(symbol = symbol, + arg.arrays = c(dlist, execs$arg.arrays[arg.params.fix.names], execs$arg.arrays[arg.params.names])[arg_update_idx], + aux.arrays = execs$aux.arrays, ctx = ctx[[1]], grad.req = grad.req) + + mx.exec.forward(execs, is.train = FALSE) + + out.pred <- mx.nd.copyto(execs$ref.outputs[[1]], mx.cpu()) + state <- mx.nd.copyto(execs$ref.outputs[[2]], mx.cpu()) + state_cell <- mx.nd.copyto(execs$ref.outputs[[3]], mx.cpu()) + + out <- lapply(execs$ref.outputs, function(out) { + mx.nd.copyto(out, mx.cpu()) + }) + } + infer.data$reset() + return(out) +} diff --git a/R-package/R/rnn_model.R b/R-package/R/rnn_model.R deleted file mode 100644 index aa4a7d03ca9b..000000000000 --- a/R-package/R/rnn_model.R +++ /dev/null @@ -1,258 +0,0 @@ -is.param.name <- function(name) { - return (grepl('weight$', name) || grepl('bias$', name) || - grepl('gamma$', name) || grepl('beta$', name) ) -} - -# Initialize the data iter -mx.model.init.iter.rnn <- function(X, y, batch.size, is.train) { - if (is.mx.dataiter(X)) return(X) - shape <- dim(X) - if (is.null(shape)) { - num.data <- length(X) - } else { - ndim <- length(shape) - num.data <- shape[[ndim]] - } - if (is.null(y)) { - if 
(is.train) stop("Need to provide parameter y for training with R arrays.") - y <- c(1:num.data) * 0 - } - - batch.size <- min(num.data, batch.size) - - return(mx.io.arrayiter(X, y, batch.size=batch.size, shuffle=is.train)) -} - -# set up rnn model with rnn cells -setup.rnn.model <- function(rnn.sym, ctx, - num.rnn.layer, seq.len, - num.hidden, num.embed, num.label, - batch.size, input.size, - init.states.name, - initializer=mx.init.uniform(0.01), - dropout=0) { - - arg.names <- rnn.sym$arguments - input.shapes <- list() - for (name in arg.names) { - if (name %in% init.states.name) { - input.shapes[[name]] <- c(num.hidden, batch.size) - } - else if (grepl('data$', name) || grepl('label$', name) ) { - if (seq.len == 1) { - input.shapes[[name]] <- c(batch.size) - } else { - input.shapes[[name]] <- c(seq.len, batch.size) - } - } - } - params <- mx.model.init.params(rnn.sym, input.shapes, NULL, initializer, mx.cpu()) - args <- input.shapes - args$symbol <- rnn.sym - args$ctx <- ctx - args$grad.req <- "write" - rnn.exec <- do.call(mx.simple.bind, args) - - mx.exec.update.arg.arrays(rnn.exec, params$arg.params, match.name=TRUE) - mx.exec.update.aux.arrays(rnn.exec, params$aux.params, match.name=TRUE) - - grad.arrays <- list() - for (name in names(rnn.exec$ref.grad.arrays)) { - if (is.param.name(name)) - grad.arrays[[name]] <- rnn.exec$ref.arg.arrays[[name]]*0 - } - mx.exec.update.grad.arrays(rnn.exec, grad.arrays, match.name=TRUE) - - return (list(rnn.exec=rnn.exec, symbol=rnn.sym, - num.rnn.layer=num.rnn.layer, num.hidden=num.hidden, - seq.len=seq.len, batch.size=batch.size, - num.embed=num.embed)) - -} - - -calc.nll <- function(seq.label.probs, batch.size) { - nll = - sum(log(seq.label.probs)) / batch.size - return (nll) -} - -get.label <- function(label, ctx) { - label <- as.array(label) - seq.len <- dim(label)[[1]] - batch.size <- dim(label)[[2]] - sm.label <- array(0, dim=c(seq.len*batch.size)) - for (seqidx in 1:seq.len) { - sm.label[((seqidx-1)*batch.size+1) : 
(seqidx*batch.size)] <- label[seqidx,] - } - return (mx.nd.array(sm.label, ctx)) -} - - -# training rnn model -train.rnn <- function (model, train.data, eval.data, - num.round, update.period, - init.states.name, - optimizer='sgd', ctx=mx.ctx.default(), - epoch.end.callback, - batch.end.callback, - verbose=TRUE, - ...) { - m <- model - - model <- list(symbol=model$symbol, arg.params=model$rnn.exec$ref.arg.arrays, - aux.params=model$rnn.exec$ref.aux.arrays) - - seq.len <- m$seq.len - batch.size <- m$batch.size - num.rnn.layer <- m$num.rnn.layer - num.hidden <- m$num.hidden - - opt <- mx.opt.create(optimizer, rescale.grad=(1/batch.size), ...) - - updater <- mx.opt.get.updater(opt, m$rnn.exec$ref.arg.arrays) - epoch.counter <- 0 - log.period <- max(as.integer(1000 / seq.len), 1) - last.perp <- 10000000.0 - - for (iteration in 1:num.round) { - nbatch <- 0 - train.nll <- 0 - # reset states - init.states <- list() - for (name in init.states.name) { - init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0 - } - - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - - tic <- Sys.time() - - train.data$reset() - - while (train.data$iter.next()) { - # set rnn input - rnn.input <- train.data$value() - mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) - - mx.exec.forward(m$rnn.exec, is.train=TRUE) - seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) - - mx.exec.backward(m$rnn.exec) - init.states <- list() - for (name in init.states.name) { - init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0 - } - - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - # update epoch counter - epoch.counter <- epoch.counter + 1 - if (epoch.counter %% update.period == 0) { - # the gradient of initial c and inital h should be zero - init.grad <- list() - for (name in init.states.name) { - init.grad[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0 - } - 
- mx.exec.update.grad.arrays(m$rnn.exec, init.grad, match.name=TRUE) - - arg.blocks <- updater(m$rnn.exec$ref.arg.arrays, m$rnn.exec$ref.grad.arrays) - - mx.exec.update.arg.arrays(m$rnn.exec, arg.blocks, skip.null=TRUE) - - grad.arrays <- list() - for (name in names(m$rnn.exec$ref.grad.arrays)) { - if (is.param.name(name)) - grad.arrays[[name]] <- m$rnn.exec$ref.grad.arrays[[name]]*0 - } - mx.exec.update.grad.arrays(m$rnn.exec, grad.arrays, match.name=TRUE) - - } - - train.nll <- train.nll + calc.nll(as.array(seq.label.probs), batch.size) - - nbatch <- nbatch + seq.len - - if (!is.null(batch.end.callback)) { - batch.end.callback(iteration, nbatch, environment()) - } - - if ((epoch.counter %% log.period) == 0) { - message(paste0("Epoch [", epoch.counter, - "] Train: NLL=", train.nll / nbatch, - ", Perp=", exp(train.nll / nbatch))) - } - } - train.data$reset() - # end of training loop - toc <- Sys.time() - message(paste0("Iter [", iteration, - "] Train: Time: ", as.numeric(toc - tic, units="secs"), - " sec, NLL=", train.nll / nbatch, - ", Perp=", exp(train.nll / nbatch))) - - if (!is.null(eval.data)) { - val.nll <- 0.0 - # validation set, reset states - init.states <- list() - for (name in init.states.name) { - init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0 - } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, match.name=TRUE) - - eval.data$reset() - nbatch <- 0 - while (eval.data$iter.next()) { - # set rnn input - rnn.input <- eval.data$value() - mx.exec.update.arg.arrays(m$rnn.exec, rnn.input, match.name=TRUE) - mx.exec.forward(m$rnn.exec, is.train=FALSE) - # probability of each label class, used to evaluate nll - seq.label.probs <- mx.nd.choose.element.0index(m$rnn.exec$ref.outputs[["sm_output"]], get.label(m$rnn.exec$ref.arg.arrays[["label"]], ctx)) - # transfer the states - init.states <- list() - for (name in init.states.name) { - init.states[[name]] <- m$rnn.exec$ref.arg.arrays[[name]]*0 - } - mx.exec.update.arg.arrays(m$rnn.exec, init.states, 
match.name=TRUE) - val.nll <- val.nll + calc.nll(as.array(seq.label.probs), batch.size) - nbatch <- nbatch + seq.len - } - eval.data$reset() - perp <- exp(val.nll / nbatch) - message(paste0("Iter [", iteration, - "] Val: NLL=", val.nll / nbatch, - ", Perp=", exp(val.nll / nbatch))) - } - # get the model out - - - epoch_continue <- TRUE - if (!is.null(epoch.end.callback)) { - epoch_continue <- epoch.end.callback(iteration, 0, environment(), verbose = verbose) - } - - if (!epoch_continue) { - break - } - } - - return (m) -} - -# check data and translate data into iterator if data is array/matrix -check.data <- function(data, batch.size, is.train) { - if (!is.null(data) && !is.list(data) && !is.mx.dataiter(data)) { - stop("The dataset should be either a mx.io.DataIter or a R list") - } - if (is.list(data)) { - if (is.null(data$data) || is.null(data$label)){ - stop("Please provide dataset as list(data=R.array, label=R.array)") - } - data <- mx.model.init.iter.rnn(data$data, data$label, batch.size=batch.size, is.train = is.train) - } - if (!is.null(data) && !data$iter.next()) { - data$reset() - if (!data$iter.next()) stop("Empty input") - } - return (data) -} diff --git a/R-package/R/viz.graph.R b/R-package/R/viz.graph.R index 7d0365b1433f..6d13de0af1d0 100644 --- a/R-package/R/viz.graph.R +++ b/R-package/R/viz.graph.R @@ -45,6 +45,7 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi "MAERegressionOutput"=, "SVMOutput"=, "LogisticRegressionOutput"=, + "MakeLoss"=, "SoftmaxOutput" = "#b3de69", "#fccde5" # default value ) @@ -122,11 +123,14 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi stringsAsFactors=F) edges_df$from<- id_dic[as.character(edges_df$from)] - nodes_df_new<- create_node_df(n = nrow(nodes_df), label=nodes_df$label, shape=nodes_df$shape, type="base", penwidth=2, color=nodes_df$color, style="filled", fillcolor=adjustcolor(nodes_df$color, alpha.f = 1)) - edge_df_new<- create_edge_df(from = 
edges_df$from, to=edges_df$to, color="black") + nodes_df_new<- create_node_df(n = nrow(nodes_df), label=nodes_df$label, shape=nodes_df$shape, type="base", penwidth=2, color=nodes_df$color, style="filled", + fillcolor=adjustcolor(nodes_df$color, alpha.f = 1), fontcolor = "black") + edge_df_new<- create_edge_df(from = edges_df$from, to=edges_df$to, color="black", fontcolor = "black") if (!is.null(shape)){ - edges_labels_raw<- symbol$get.internals()$infer.shape(list(data=shape))$out.shapes + if (is.list(shape)) { + edges_labels_raw<- symbol$get.internals()$infer.shape(shape)$out.shapes + } else edges_labels_raw<- symbol$get.internals()$infer.shape(list(data=shape))$out.shapes if (!is.null(edges_labels_raw)){ edge_label_str <- function(x) paste0(x, collapse="X") edges_labels_raw<- sapply(edges_labels_raw, edge_label_str) @@ -145,9 +149,6 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi } else { graph_render<- render_graph(graph = graph, output = "graph", width = graph.width.px, height = graph.height.px) } - - # graph <-visNetwork(nodes = nodes_df, edges = edges_df, main = graph.title) %>% - # visHierarchicalLayout(direction = "UD", sortMethod = "directed") return(graph_render) } diff --git a/R-package/README.md b/R-package/README.md index 6576700e11c6..c39b2b101d2e 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -19,7 +19,7 @@ You can install the CPU package directly from the R console: ```r cran <- getOption("repos") -cran["dmlc"] <- "https://s3-us-west-2.amazonaws.com/apache-mxnet/R/CRAN/" +cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/" options(repos = cran) install.packages("mxnet") ``` diff --git a/R-package/tests/testthat/get_data.R b/R-package/tests/testthat/get_data.R index 6d8de8516ae1..2676b20fa80b 100644 --- a/R-package/tests/testthat/get_data.R +++ b/R-package/tests/testthat/get_data.R @@ -19,7 +19,7 @@ GetMNIST_csv <- function() { } if (!file.exists('data/train.csv') | 
!file.exists('data/test.csv')) { - download.file('https://s3-us-west-2.amazonaws.com/apache-mxnet/R/data/mnist_csv.zip', + download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip', destfile = 'data/mnist_csv.zip') unzip('data/mnist_csv.zip', exdir = 'data/') file.remove('data/mnist_csv.zip') @@ -61,7 +61,7 @@ GetCatDog <- function() { } if (!file.exists('data/cats_dogs/cats_dogs_train.rec') | !file.exists('data/cats_dogs/cats_dogs_val.rec')) { - download.file('https://s3-us-west-2.amazonaws.com/apache-mxnet/R/data/cats_dogs.zip', + download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/cats_dogs.zip', destfile = 'data/cats_dogs.zip') unzip('data/cats_dogs.zip', exdir = 'data/') file.remove('data/cats_dogs.zip') @@ -86,7 +86,7 @@ GetISBI_data <- function() { } if (!file.exists('data/ISBI/train-volume.tif') | !file.exists('data/ISBI/train-labels.tif')) { - download.file('https://s3-us-west-2.amazonaws.com/apache-mxnet/R/data/ISBI.zip', + download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/ISBI.zip', destfile = 'data/ISBI.zip') unzip('data/ISBI.zip', exdir = 'data/') file.remove('data/ISBI.zip') @@ -99,7 +99,7 @@ GetCaptcha_data <- function() { } if (!file.exists('data/captcha_example/captcha_train.rec') | !file.exists('data/captcha_example/captcha_test.rec')) { - download.file('https://s3-us-west-2.amazonaws.com/apache-mxnet/R/data/captcha_example.zip', + download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip', destfile = 'data/captcha_example.zip') unzip('data/captcha_example.zip', exdir = 'data/') file.remove('data/captcha_example.zip') diff --git a/R-package/tests/testthat/test_img_seg.R b/R-package/tests/testthat/test_img_seg.R index fbca92e2a8a2..b3400cd3bbc6 100644 --- a/R-package/tests/testthat/test_img_seg.R +++ b/R-package/tests/testthat/test_img_seg.R @@ -90,7 +90,7 @@ context("Image segmentation") test_that("UNET", { 
list.of.packages <- c("imager") new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])] - if(length(new.packages)) install.packages(new.packages) + if(length(new.packages)) install.packages(new.packages, repos = "https://cloud.r-project.org/") GetISBI_data() library(imager) IMG_SIZE <- 168 @@ -132,4 +132,4 @@ test_that("UNET", { learning.rate = 0.05, momentum = 0.99, array.batch.size = 2) -}) \ No newline at end of file +}) diff --git a/R-package/tests/testthat/test_lstm.R b/R-package/tests/testthat/test_lstm.R deleted file mode 100644 index 4a5cdbeb436f..000000000000 --- a/R-package/tests/testthat/test_lstm.R +++ /dev/null @@ -1,57 +0,0 @@ -require(mxnet) - -if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 1) { - mx.ctx.default(new = mx.gpu()) - message("Using GPU for testing.") -} - -context("lstm models") - -get.nll <- function(s) { - pat <- ".*\\NLL=(.+), Perp=.*" - nll <- sub(pat, "\\1", s) - return (as.numeric(nll)) -} - -test_that("training error decreasing", { - - # Set basic network parameters. 
- batch.size = 2 - seq.len = 2 - num.hidden = 1 - num.embed = 2 - num.lstm.layer = 2 - num.round = 5 - learning.rate= 0.1 - wd=0.00001 - clip_gradient=1 - update.period = 1 - vocab=17 - - X.train <- list(data=array(1:16, dim=c(2,8)), label=array(2:17, dim=c(2,8))) - - s <- capture.output(model <- mx.lstm( X.train, - ctx=mx.ctx.default(), - num.round=num.round, - update.period=update.period, - num.lstm.layer=num.lstm.layer, - seq.len=seq.len, - num.hidden=num.hidden, - num.embed=num.embed, - num.label=vocab, - batch.size=batch.size, - input.size=vocab, - initializer=mx.init.uniform(0.01), - learning.rate=learning.rate, - wd=wd, - clip_gradient=clip_gradient)) - - prev.nll <- 10000000.0 - for (r in s) { - nll <- get.nll(r) - expect_true(prev.nll >= nll) - prev.nll <- nll - - } - -}) \ No newline at end of file diff --git a/R-package/vignettes/CustomIterator.Rmd b/R-package/vignettes/CustomIterator.Rmd index 22ac90fe0400..b5a6576a5bc6 100644 --- a/R-package/vignettes/CustomIterator.Rmd +++ b/R-package/vignettes/CustomIterator.Rmd @@ -10,7 +10,7 @@ The data we are going to use is the [MNIST dataset](http://yann.lecun.com/exdb/m To download the data: ```{r} -download.file('https://s3-us-west-2.amazonaws.com/apache-mxnet/R/data/mnist_csv.zip', +download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip', destfile = 'mnist_csv.zip') unzip('mnist_csv.zip', exdir = '.') ``` diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd index 988fd18e8b4d..055f1ae51d7e 100644 --- a/R-package/vignettes/mnistCompetition.Rmd +++ b/R-package/vignettes/mnistCompetition.Rmd @@ -10,7 +10,7 @@ First, let us download the data from [here](https://www.kaggle.com/c/digit-recog Then we can read them in R and convert to matrices. 
```{r, echo=FALSE} -download.file('https://s3-us-west-2.amazonaws.com/apache-mxnet/R/data/mnist_csv.zip', destfile = 'mnist_csv.zip') +download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip', destfile = 'mnist_csv.zip') unzip('mnist_csv.zip', exdir = '.') ``` diff --git a/README.md b/README.md index a11780aa019b..6e7dc41c1e5b 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,31 @@ Apache MXNet (incubating) for Deep Learning ===== -[![Build Status](https://travis-ci.org/dmlc/mxnet.svg?branch=master)](https://travis-ci.org/dmlc/mxnet) -[![Documentation Status](https://readthedocs.org/projects/mxnet/badge/?version=latest)](http://mxnet.io/) +[![Build Status](https://builds.apache.org/job/incubator-mxnet/job/master/badge/icon)](https://builds.apache.org/job/incubator-mxnet/job/master/) +[![Documentation Status](https://builds.apache.org/job/incubator-mxnet-build-site/badge/icon)](https://mxnet.incubator.apache.org/) [![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE) ![banner](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/banner.png) Apache MXNet (incubating) is a deep learning framework designed for both *efficiency* and *flexibility*. -It allows you to ***mix*** [symbolic and imperative programming](http://mxnet.io/architecture/index.html#deep-learning-system-design-concepts) +It allows you to ***mix*** [symbolic and imperative programming](https://mxnet.incubator.apache.org/architecture/index.html#deep-learning-system-design-concepts) to ***maximize*** efficiency and productivity. At its core, MXNet contains a dynamic dependency scheduler that automatically parallelizes both symbolic and imperative operations on the fly. A graph optimization layer on top of that makes symbolic execution fast and memory efficient. MXNet is portable and lightweight, scaling effectively to multiple GPUs and multiple machines. MXNet is also more than a deep learning project. 
It is also a collection of -[blue prints and guidelines](http://mxnet.io/architecture/index.html#deep-learning-system-design-concepts) for building +[blue prints and guidelines](https://mxnet.incubator.apache.org/architecture/index.html#deep-learning-system-design-concepts) for building deep learning systems, and interesting insights of DL systems for hackers. [![Join the chat at https://gitter.im/dmlc/mxnet](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/mxnet?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) What's New ---------- -* [Version 0.11.0-rc2 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.11.0.rc2) - MXNet 0.11.0-rc2 Release. +* [Version 1.0.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/1.0.0) - MXNet 1.0.0 Release. +* [Version 0.12.1 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.1) - MXNet 0.12.1 Patch Release. +* [Version 0.12.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.12.0) - MXNet 0.12.0 Release. +* [Version 0.11.0 Release](https://github.com/apache/incubator-mxnet/releases/tag/0.11.0) - MXNet 0.11.0 Release. * [Apache Incubator](http://incubator.apache.org/projects/mxnet.html) - We are now an Apache Incubator project. * [Version 0.10.0 Release](https://github.com/dmlc/mxnet/releases/tag/v0.10.0) - MXNet 0.10.0 Release. * [Version 0.9.3 Release](./docs/architecture/release_note_0_9.md) - First 0.9 official release. 
@@ -33,26 +36,26 @@ What's New * [MKLDNN for Faster CPU Performance](./MKL_README.md) * [MXNet Memory Monger, Training Deeper Nets with Sublinear Memory Cost](https://github.com/dmlc/mxnet-memonger) * [Tutorial for NVidia GTC 2016](https://github.com/dmlc/mxnet-gtc-tutorial) -* [Embedding Torch layers and functions in MXNet](http://mxnet.io/how_to/torch.html) +* [Embedding Torch layers and functions in MXNet](https://mxnet.incubator.apache.org/how_to/torch.html) * [MXNet.js: Javascript Package for Deep Learning in Browser (without server) ](https://github.com/dmlc/mxnet.js/) -* [Design Note: Design Efficient Deep Learning Data Loading Module](http://mxnet.io/architecture/note_data_loading.html) -* [MXNet on Mobile Device](http://mxnet.io/how_to/smart_device.html) -* [Distributed Training](http://mxnet.io/how_to/multi_devices.html) -* [Guide to Creating New Operators (Layers)](http://mxnet.io/how_to/new_op.html) +* [Design Note: Design Efficient Deep Learning Data Loading Module](https://mxnet.incubator.apache.org/architecture/note_data_loading.html) +* [MXNet on Mobile Device](https://mxnet.incubator.apache.org/how_to/smart_device.html) +* [Distributed Training](https://mxnet.incubator.apache.org/how_to/multi_devices.html) +* [Guide to Creating New Operators (Layers)](https://mxnet.incubator.apache.org/how_to/new_op.html) * [Go binding for inference](https://github.com/songtianyi/go-mxnet-predictor) * [Amalgamation and Go Binding for Predictors](https://github.com/jdeng/gomxnet/) - Outdated -* [Training Deep Net on 14 Million Images on A Single Machine](http://mxnet.io/tutorials/computer_vision/imagenet_full.html) +* [Large Scale Image Classification](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification) Contents -------- -* [Documentation](http://mxnet.io/) and [Tutorials](http://mxnet.io/tutorials/) -* [Design Notes](http://mxnet.io/architecture/index.html) +* [Documentation](https://mxnet.incubator.apache.org/) and 
[Tutorials](https://mxnet.incubator.apache.org/tutorials/) +* [Design Notes](https://mxnet.incubator.apache.org/architecture/index.html) * [Code Examples](https://github.com/dmlc/mxnet/tree/master/example) -* [Installation](http://mxnet.io/get_started/install.html) +* [Installation](https://mxnet.incubator.apache.org/get_started/install.html) * [Pretrained Models](https://github.com/dmlc/mxnet-model-gallery) -* [Contribute to MXNet](http://mxnet.io/community/contribute.html) -* [Frequent Asked Questions](http://mxnet.io/how_to/faq.html) +* [Contribute to MXNet](https://mxnet.incubator.apache.org/community/contribute.html) +* [Frequent Asked Questions](https://mxnet.incubator.apache.org/how_to/faq.html) Features -------- @@ -70,7 +73,7 @@ Ask Questions License ------- -© Contributors, 2015-2017. Licensed under an [Apache-2.0](https://github.com/dmlc/mxnet/blob/master/LICENSE) license. +Licensed under an [Apache-2.0](https://github.com/dmlc/mxnet/blob/master/LICENSE) license. Reference Paper --------------- diff --git a/amalgamation/amalgamation.py b/amalgamation/amalgamation.py index 22b421d79fba..2aba8f4bdc77 100644 --- a/amalgamation/amalgamation.py +++ b/amalgamation/amalgamation.py @@ -32,6 +32,10 @@ minimum = int(sys.argv[6]) if len(sys.argv) > 5 else 0 android = int(sys.argv[7]) if len(sys.argv) > 6 else 0 +# blacklist linear algebra headers when building without blas. +if minimum != 0: + blacklist.append('linalg.h') + def pprint(lst): for item in lst: print item diff --git a/amalgamation/dmlc-minimum0.cc b/amalgamation/dmlc-minimum0.cc index 3f7a97bb0139..be1793a51d7c 100644 --- a/amalgamation/dmlc-minimum0.cc +++ b/amalgamation/dmlc-minimum0.cc @@ -18,11 +18,13 @@ */ /*! + * Copyright 2015 by Contributors. * \brief Mininum DMLC library Amalgamation, used for easy plugin of dmlc lib. * Normally this is not needed. 
*/ #include "../dmlc-core/src/io/line_split.cc" #include "../dmlc-core/src/io/recordio_split.cc" +#include "../dmlc-core/src/io/indexed_recordio_split.cc" #include "../dmlc-core/src/io/input_split_base.cc" #include "../dmlc-core/src/io/local_filesys.cc" #include "../dmlc-core/src/data.cc" diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc index badf23771dbc..053dc1e7a691 100644 --- a/amalgamation/mxnet_predict0.cc +++ b/amalgamation/mxnet_predict0.cc @@ -45,9 +45,11 @@ #include "src/ndarray/ndarray_function.cc" -#include "src/ndarray/autograd.cc" #include "src/ndarray/ndarray.cc" +#include "src/imperative/imperative.cc" +#include "src/imperative/cached_op.cc" + #include "src/engine/engine.cc" #include "src/engine/naive_engine.cc" #include "src/engine/profiler.cc" @@ -74,9 +76,11 @@ #include "src/operator/softmax_activation.cc" #include "src/operator/softmax_output.cc" #include "src/operator/tensor/elemwise_binary_broadcast_op_basic.cc" +#include "src/operator/tensor/elemwise_binary_op.cc" #include "src/operator/tensor/elemwise_binary_op_basic.cc" #include "src/operator/tensor/elemwise_binary_scalar_op_basic.cc" -#include "src/operator/tensor/elemwise_unary_op.cc" +#include "src/operator/tensor/elemwise_unary_op_basic.cc" +#include "src/operator/tensor/elemwise_unary_op_trig.cc" #include "src/operator/tensor/matrix_op.cc" #include "src/storage/storage.cc" @@ -88,3 +92,4 @@ #include "src/c_api/c_api_symbolic.cc" #include "src/c_api/c_api_ndarray.cc" #include "src/c_api/c_api_error.cc" + diff --git a/amalgamation/prep_nnvm.sh b/amalgamation/prep_nnvm.sh index baf6d4d2d0a7..60c96743307c 100755 --- a/amalgamation/prep_nnvm.sh +++ b/amalgamation/prep_nnvm.sh @@ -1,4 +1,20 @@ #! /bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. DMLC_CORE=$(pwd)/../dmlc-core cd ../nnvm/amalgamation make clean diff --git a/benchmark/python/sparse/cast_storage.py b/benchmark/python/sparse/cast_storage.py new file mode 100644 index 000000000000..7ae537398c42 --- /dev/null +++ b/benchmark/python/sparse/cast_storage.py @@ -0,0 +1,99 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import ctypes + +from mxnet.test_utils import * +import os +import time +import argparse + +from mxnet.base import check_call, _LIB + +parser = argparse.ArgumentParser(description="Benchmark cast storage operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +def measure_cost(repeat, f, *args, **kwargs): + start = time.time() + results = [] + for i in range(repeat): + (f(*args, **kwargs)).wait_to_read() + end = time.time() + diff = end - start + return diff / repeat + + +def run_cast_storage_synthetic(): + def dense_to_sparse(m, n, density, ctx, repeat, stype): + set_default_context(ctx) + data_shape = (m, n) + dns_data = rand_ndarray(data_shape, stype, density).tostype('default') + dns_data.wait_to_read() + + # do one warm up run, verify correctness + assert same(mx.nd.cast_storage(dns_data, stype).asnumpy(), dns_data.asnumpy()) + + # start benchmarking + cost = measure_cost(repeat, mx.nd.cast_storage, dns_data, stype) + results = '{:10.1f} {:>10} {:8d} {:8d} {:10.2f}'.format(density*100, str(ctx), m, n, cost*1000) + print(results) + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + + # params + # m number of rows + # n number of columns + # density density of the matrix + # num_repeat number of benchmark runs to average over + # contexts mx.cpu(), mx.gpu() + # note: benchmark different contexts separately; to benchmark cpu, compile without CUDA + # benchmarks dns_to_csr, dns_to_rsp + m = [ 512, 512] + n = [50000, 100000] + density = [1.00, 0.80, 0.60, 0.40, 0.20, 0.10, 0.05, 0.02, 0.01] + num_repeat = 10 + contexts = [mx.gpu()] + benchmarks = ["dns_to_csr", "dns_to_rsp"] + + # run benchmark + for b in benchmarks: + stype = '' + print("==================================================") + if b is "dns_to_csr": + stype = 'csr' + print(" cast_storage benchmark: dense to csr, size m x n 
") + elif b is "dns_to_rsp": + stype = 'row_sparse' + print(" cast_storage benchmark: dense to rsp, size m x n ") + else: + print("invalid benchmark: %s" %b) + continue + print("==================================================") + headline = '{:>10} {:>10} {:>8} {:>8} {:>10}'.format('density(%)', 'context', 'm', 'n', 'time(ms)') + print(headline) + for i in range(len(n)): + for ctx in contexts: + for den in density: + dense_to_sparse(m[i], n[i], den, ctx, num_repeat, stype) + print("") + print("") + + +if __name__ == "__main__": + run_cast_storage_synthetic() diff --git a/benchmark/python/sparse/dot.py b/benchmark/python/sparse/dot.py new file mode 100644 index 000000000000..164e50aef051 --- /dev/null +++ b/benchmark/python/sparse/dot.py @@ -0,0 +1,445 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import ctypes + +import os +import time +import argparse +import subprocess +import scipy.sparse as sp + +import mxnet as mx +import numpy as np +import numpy.random as rnd +from mxnet.test_utils import rand_ndarray, set_default_context, assert_almost_equal, get_bz2_data +from mxnet.base import check_call, _LIB +from util import estimate_density + +PARSER = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +PARSER.add_argument('--num-omp-threads', type=int, + default=1, help='number of omp threads to set in MXNet') +PARSER.add_argument('--gpu', action='store_true', + help="to be run on gpu") +# TODO: Use logging later +PARSER.add_argument('--verbose', action='store_true', + help="Verbose output") +ARGS = PARSER.parse_args() + +# some data information +KDDA = { + 'data_mini': 'kdda.t.mini', + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, + 'm': [1, 8, 32], + 'batch_size': [64], + 'default_index': {'batch_size': 0, + 'output_dim': 2}, + 'num_batches': 10 +} + +AVAZU = { + 'data_mini': 'avazu-app.t.mini', + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, + 'm': [1, 1000, 2000], + 'batch_size': [128, 256], + 'default_index': {'batch_size': 0, + 'output_dim': 1}, + 'num_batches': 10 +} + +CRITEO = { + 'data_mini': 'criteo.t.mini', + 'data_name': 'criteo.t', + 'data_origin_name': 'criteo.t.bz2', + 'url' : "https://s3-us-west-2.amazonaws.com/sparse-dataset/criteo.t.bz2", + 'feature_dim': 8388621, + 'm': [1, 8, 16, 32, 64], + 'batch_size': [64, 128], + 'default_index': {'batch_size': 1, + 'output_dim': 3}, + 'num_batches': 10 +} + +SYNTHETIC1 = { + 'feature_dim': [1000000], + 'm': [256, 1000], + 'density': [0.001, 0.005, 0.01, 0.02, 0.05, 
+ 0.1, 0.2, 0.5, 0.65], + 'batch_size': [64, 128], + 'default_index': {'batch_size': 1, + 'density': 2, + 'output_dim': 1, + 'feature_dim': 0}, + 'num_repeat': 10 +} + +SYNTHETIC2 = { + 'feature_dim': [8000000, 16000000], + 'm': [1, 32], + 'density': [0.001, 0.005, 0.01, 0.02, 0.05, + 0.1, 0.2, 0.5, 0.65], + 'batch_size': [64, 128], + 'default_index': {'batch_size': 1, + 'density': 2, + 'output_dim': 1, + 'feature_dim': 0}, + 'num_repeat': 10 +} + +def measure_cost(repeat, scipy_trans_lhs, scipy_dns_lhs, func_name, *args, **kwargs): + """Measure time cost of running a function + """ + mx.nd.waitall() + args_list = [] + for arg in args: + args_list.append(arg) + start = time.time() + if scipy_trans_lhs: + args_list[0] = np.transpose(args_list[0]) if scipy_dns_lhs else sp.spmatrix.transpose(args_list[0]) + for _ in range(repeat): + func_name(*args_list, **kwargs) + mx.nd.waitall() + end = time.time() + diff = end - start + return diff / repeat + + +def _get_iter(path, data_shape, batch_size): + data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) + data_iter = iter(data_train) + return data_iter + + +def _line_count(path): + return int(subprocess.check_output('wc -l {}'.format(path), shell=True).split()[0]) + + +def _compare_sparse_dense(data_dir, file_name, mini_file_name, feature_dim, + output_dim, density, batch_size, num_batches=3, num_repeat=5, transpose=False, + rsp=False): + + def create_mini_path(mini_path, path, num_batches): + """Samples batches of size: batch_size, total number: num_batches + from the dataset files for running benchmarks""" + if not os.path.exists(mini_path): + last = _line_count(path) - num_batches * batch_size + last = last if last >= 1 else 1 + start = int(rnd.uniform(1, last)) + os.system("sed -n '%d,%dp' %r > %r" + %(start, start + num_batches * batch_size, path, mini_path)) + assert os.path.exists(mini_path) + + + def run_benchmark(mini_path): + """Run benchmarks + """ + data_shape = 
(feature_dim, ) + train_iter = _get_iter(mini_path, data_shape, batch_size) + weight_row_dim = batch_size if transpose else feature_dim + weight_shape = (weight_row_dim, output_dim) + if not rsp: + weight = mx.nd.random.uniform(low=0, high=1, shape=weight_shape) + else: + weight = rand_ndarray(weight_shape, "row_sparse", density=0.05, distribution="uniform") + total_cost = {} + average_cost = {} + count = 0 + total_cost["sparse"] = 0. + total_cost["dense"] = 0. + for _ in train_iter: + csr_data = train_iter.getdata() + dns_data = csr_data.tostype('default') + cost_sparse = measure_cost(num_repeat, False, False, mx.nd.sparse.dot, csr_data, weight, transpose_a=transpose) + cost_dense = measure_cost(num_repeat, False, False, mx.nd.dot, dns_data, weight, transpose_a=transpose) + total_cost["sparse"] += cost_sparse + total_cost["dense"] += cost_dense + count = count + 1 + average_cost["sparse"] = total_cost["sparse"] / count + average_cost["dense"] = total_cost["dense"] / count + return (average_cost["sparse"], average_cost["dense"]) + + + def print_result(average_cost_sparse, average_cost_dense): + """Print result of comparison between sparse and dense + """ + ratio = average_cost_dense / average_cost_sparse + fmt = '{:15.4f} {:10d} {:10d} {:10d} {:20.2f} {:15.2f} {:15.2f} {:10} {:10}' + print(fmt.format(density * 100, batch_size, output_dim, feature_dim, + ratio, average_cost_dense*1000, average_cost_sparse*1000, + transpose, rsp)) + + mini_path = os.path.join(data_dir, mini_file_name) + path = os.path.join(data_dir, file_name) + create_mini_path(mini_path, path, num_batches) + average_cost_sparse, average_cost_dense = run_benchmark(mini_path) + print_result(average_cost_sparse, average_cost_dense) + + +def test_dot_real(data_dict): + """Dot operator testing with real datasets""" + data_dir = os.path.join(os.getcwd(), 'data') + + path = os.path.join(data_dir, data_dict['data_name']) + if not os.path.exists(path): + get_bz2_data( + data_dir, + data_dict['data_name'], + 
data_dict['url'], + data_dict['data_origin_name'] + ) + assert os.path.exists(path) + + k = data_dict['feature_dim'] + m = data_dict['m'] + batch_size_list = data_dict['batch_size'] + + default_output_index = data_dict['default_index']['output_dim'] + default_batch_size_index = data_dict['default_index']['batch_size'] + density = estimate_density(path, data_dict['feature_dim']) + num_batches = data_dict['num_batches'] + + assert default_batch_size_index < len(batch_size_list) + assert default_output_index < len(m) + if ARGS.verbose: + print("Running Benchmarking on %r data") % data_dict['data_mini'] + print('{:>15} {:>10} {:>10} {:>10} {:>20} {:>15} {:>15} {:>10} {:>10}'.format('density(%)', + 'n', + 'm', + 'k', + 't_dense/t_sparse', + 't_dense(ms)', + 't_sparse(ms)', + 'is_transpose', + 'rhs_rsp')) + + + for output_dim in m: + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches, + transpose=True) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches, rsp=True) + + for batch_size in batch_size_list: + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, m[default_output_index], density, batch_size, num_batches) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, m[default_output_index], density, batch_size, num_batches, + transpose=True) + _compare_sparse_dense(data_dir, data_dict['data_name'], data_dict['data_mini'], + k, output_dim, density, + batch_size_list[default_batch_size_index], num_batches, rsp=True) + + +def test_dot_synthetic(data_dict): + """benchmark sparse mxnet dot and scipy dot operator with matrices 
of given density. + `t_sparse` is the runtime of the invoked sparse dot operator in ms, while `t_dense` is the + runtime of dot(dns, dns), with the same matrices except that they are in default storage type. + """ + # Benchmark MXNet and Scipys dot operator + def bench_dot(lhs_shape, rhs_shape, lhs_stype, rhs_stype, + lhs_den, rhs_den, trans_lhs, ctx, num_repeat=10, fw="mxnet", distribution="uniform"): + set_default_context(ctx) + assert fw == "mxnet" or fw == "scipy" + # Set funcs + dot_func_sparse = mx.nd.sparse.dot if fw == "mxnet" else sp.spmatrix.dot + dot_func_dense = mx.nd.dot if fw == "mxnet" else np.dot + # Create matrix instances + lhs_nd = rand_ndarray(lhs_shape, lhs_stype, density=lhs_den, distribution=distribution) + # only uniform distribution supported for rhs + rhs_nd = rand_ndarray(rhs_shape, rhs_stype, density=rhs_den, distribution="uniform") + lhs_dns = None + rhs_dns = None + dense_cost = None + sparse_cost = None + + if fw == "mxnet": + lhs_dns = lhs_nd if lhs_stype == 'default' else lhs_nd.tostype('default') + rhs_dns = rhs_nd if rhs_stype == 'default' else rhs_nd.tostype('default') + # One warm up run, verify correctness + out = dot_func_sparse(lhs_nd, rhs_dns, trans_lhs) + out_expected = dot_func_dense(lhs_dns, rhs_dns, trans_lhs) + assert_almost_equal(out.asnumpy(), out_expected.asnumpy(), rtol=1e-1, atol=1e-1) + sparse_cost = measure_cost(num_repeat, False, False, dot_func_sparse, lhs_nd, rhs_nd, trans_lhs) + dense_cost = measure_cost(num_repeat, False, False, dot_func_dense, lhs_dns, rhs_dns, trans_lhs) + else: + lhs_dns = lhs_nd.asnumpy() + rhs_dns = rhs_nd.asnumpy() + lhs_nd = sp.csr_matrix(lhs_nd.asnumpy()) + rhs_nd = rhs_nd.asnumpy() + # One warm up run, verify correctness + lhs_nd_copy = sp.spmatrix.transpose(lhs_nd) if trans_lhs else lhs_nd + out = dot_func_sparse(lhs_nd_copy, rhs_dns) + sparse_cost = measure_cost(num_repeat, trans_lhs, False, dot_func_sparse, lhs_nd, rhs_nd) + dense_cost = measure_cost(num_repeat, trans_lhs, True, 
dot_func_dense, lhs_dns, rhs_dns) + + speedup = dense_cost / sparse_cost + # Print results + m = lhs_shape[0] + k = lhs_shape[1] + n = rhs_shape[1] + result_pattern = '{:15.1f} {:15.1f} {:>10} {:8d} {:8d} {:8d} {:13.2f} {:13.2f} {:8.2f}' + results = result_pattern.format(lhs_den*100, + rhs_den*100, + str(ctx), + m, + k, + n, + sparse_cost*1000, + dense_cost*1000, + speedup) + print(results) + + def print_benchmark_info(lhs, rhs, lhs_trans, fw): + trans_str = "^T" if lhs_trans else "" + print("========================================================") + print(" %s sparse dot benchmark: dot(%s, %s) = %s ") % (fw, lhs, rhs, rhs) + print(" (matrix multiplication: (m x k)%s * (k x n) = m x n) ") % (trans_str) + print("========================================================") + headline_pattern = '{:>15} {:>15} {:>10} {:>8} {:>8} {:>8} {:>13} {:>13} {:>8}' + headline = headline_pattern.format('lhs_density(%)', + 'rhs_density(%)', + 'context', + 'm', 'k', 'n', + 't_sparse(ms)', + 't_dense(ms)', + 'speedup') + print(headline) + + + def run_benchmark(ctx=None, lhs="csr", lhs_trans=False, rhs="dns", fw="mxnet", rhs_density=1, + distribution="uniform"): + if lhs != "csr": + raise ValueError("Value other than csr for lhs not supported") + if rhs_density > 1 or rhs_density < 0: + raise ValueError("rhs_density has to be between 0 and 1") + + print_benchmark_info(lhs, rhs, lhs_trans, fw) + + + lhs_stype = "csr" + rhs_stype = "row_sparse" if rhs == "rsp" else "default" + + feature_dim_list = data_dict['feature_dim'] + output_dim_list = data_dict['m'] + batch_size_list = data_dict['batch_size'] + density_list = data_dict['density'] + + default_output_index = data_dict['default_index']['output_dim'] + default_batch_size_index = data_dict['default_index']['batch_size'] + default_feature_index = data_dict['default_index']['feature_dim'] + default_density_index = data_dict['default_index']['density'] + num_repeat = data_dict['num_repeat'] + + for output_dim in output_dim_list: + if 
lhs_trans: + output_row_dim = batch_size_list[default_batch_size_index] + else: + output_row_dim = feature_dim_list[default_feature_index] + bench_dot((batch_size_list[default_batch_size_index], + feature_dim_list[default_feature_index]), + (output_row_dim, output_dim), + lhs_stype, rhs_stype, + density_list[default_density_index], rhs_density, + lhs_trans, ctx, num_repeat=num_repeat, + fw=fw, distribution=distribution) + + for feature_dim in feature_dim_list: + if lhs_trans: + output_row_dim = batch_size_list[default_batch_size_index] + else: + output_row_dim = feature_dim + bench_dot((batch_size_list[default_batch_size_index], feature_dim), + (output_row_dim, output_dim_list[default_output_index]), + lhs_stype, rhs_stype, density_list[default_density_index], rhs_density, + lhs_trans, ctx, num_repeat=num_repeat, fw=fw, distribution=distribution) + + for batch_size in batch_size_list: + if lhs_trans: + output_row_dim = batch_size + else: + output_row_dim = feature_dim_list[default_feature_index] + bench_dot((batch_size, feature_dim_list[default_feature_index]), + (output_row_dim, + output_dim_list[default_output_index]), + lhs_stype, rhs_stype, density_list[default_density_index], + rhs_density, lhs_trans, ctx, num_repeat=num_repeat, + fw=fw, distribution=distribution) + + for density in density_list: + if lhs_trans: + output_row_dim = batch_size_list[default_batch_size_index] + else: + output_row_dim = feature_dim_list[default_feature_index] + bench_dot((batch_size_list[default_batch_size_index], + feature_dim_list[default_feature_index]), + (output_row_dim, + output_dim_list[default_output_index]), + lhs_stype, rhs_stype, density, rhs_density, lhs_trans, ctx, + num_repeat=num_repeat, fw=fw, distribution=distribution) + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(ARGS.num_omp_threads))) + context = mx.gpu() if ARGS.gpu else mx.cpu() + # TODO(anirudh): make the data dicts to config which can be passed at runtime + distributions = ["uniform", "powerlaw"] + for 
distribution in distributions: + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=False, + fw="mxnet", rhs_density=1, + distribution=distribution) + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=True, + fw="mxnet", rhs_density=1, + distribution=distribution) + run_benchmark(context, lhs="csr", + rhs="rsp", lhs_trans=False, + fw="mxnet", rhs_density=0.05, + distribution=distribution) + if not ARGS.gpu: + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=False, + fw="scipy", rhs_density=1, + distribution=distribution) + run_benchmark(context, lhs="csr", + rhs="default", lhs_trans=True, + fw="scipy", rhs_density=1, + distribution=distribution) + + +if __name__ == "__main__": + begin_time = time.time() + test_dot_real(KDDA) + test_dot_real(AVAZU) + test_dot_real(CRITEO) + test_dot_synthetic(SYNTHETIC1) + test_dot_synthetic(SYNTHETIC2) + total_time = time.time() - begin_time + print("total time is %f") % total_time diff --git a/benchmark/python/sparse/memory_benchmark.py b/benchmark/python/sparse/memory_benchmark.py new file mode 100644 index 000000000000..b60f214ec79e --- /dev/null +++ b/benchmark/python/sparse/memory_benchmark.py @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Should be run with valgrind to get memory consumption + for sparse format storage and dot operators. This script can be + used for memory benchmarking on CPU only""" +import ctypes +import sys +import argparse +import mxnet as mx +from mxnet.test_utils import rand_ndarray +from mxnet.base import check_call, _LIB + + +def parse_args(): + """ Function to parse arguments + """ + parser = argparse.ArgumentParser() + parser.add_argument("--lhs-row-dim", + required=True, + help="Provide batch_size") + parser.add_argument("--lhs-col-dim", + required=True, + help="Provide feature_dim") + parser.add_argument("--rhs-col-dim", + required=True, + help="Provide output_dim") + parser.add_argument("--density", + required=True, + help="Density for lhs") + parser.add_argument("--num-omp-threads", type=int, + default=1, help="number of omp threads to set in MXNet") + parser.add_argument("--lhs-stype", default="csr", + choices=["csr", "default", "row_sparse"], + help="stype for lhs", + required=True) + parser.add_argument("--rhs-stype", default="default", + choices=["default", "row_sparse"], + help="rhs stype", + required=True) + parser.add_argument("--only-storage", + action="store_true", + help="only storage") + parser.add_argument("--rhs-density", + help="rhs_density") + return parser.parse_args() + + +def main(): + args = parse_args() + lhs_row_dim = int(args.lhs_row_dim) + lhs_col_dim = int(args.lhs_col_dim) + rhs_col_dim = int(args.rhs_col_dim) + density = float(args.density) + lhs_stype = args.lhs_stype + rhs_stype = args.rhs_stype + if args.rhs_density: + rhs_density = float(args.rhs_density) + else: + rhs_density = density + dot_func = mx.nd.sparse.dot if lhs_stype == "csr" else mx.nd.dot + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + bench_dot(lhs_row_dim, lhs_col_dim, rhs_col_dim, density, + rhs_density, dot_func, False, lhs_stype, rhs_stype, args.only_storage) + +def bench_dot(lhs_row_dim, lhs_col_dim, rhs_col_dim, density, + rhs_density, 
dot_func, trans_lhs, lhs_stype, + rhs_stype, only_storage, distribution="uniform"): + """ Benchmarking both storage and dot + """ + lhs_nd = rand_ndarray((lhs_row_dim, lhs_col_dim), lhs_stype, density, distribution=distribution) + if not only_storage: + rhs_nd = rand_ndarray((lhs_col_dim, rhs_col_dim), rhs_stype, + density=rhs_density, distribution=distribution) + out = dot_func(lhs_nd, rhs_nd, trans_lhs) + mx.nd.waitall() + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/benchmark/python/sparse/sparse_end2end.py b/benchmark/python/sparse/sparse_end2end.py new file mode 100644 index 000000000000..ecd9057dedfc --- /dev/null +++ b/benchmark/python/sparse/sparse_end2end.py @@ -0,0 +1,307 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import time +import argparse +import os +import multiprocessing +from mxnet.test_utils import * + +MAX_NUM_BATCH = 99999999 +COMP = "compute" +COMM = "communication" +IO = "io" + +parser = argparse.ArgumentParser(description="Run sparse linear regression " \ + "with distributed kvstore", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--profiler', type=int, default=0, + help='whether to use profiler') +parser.add_argument('--num-epoch', type=int, default=1, + help='number of epochs to train') +parser.add_argument('--batch-size', type=int, default=512, + help='number of examples per batch') +parser.add_argument('--num-batch', type=int, default=MAX_NUM_BATCH, + help='number of batches per epoch') +parser.add_argument('--dummy-iter', type=int, default=0, + help='whether to use dummy iterator to exclude io cost') +parser.add_argument('--kvstore', type=str, default=None, + help='what kvstore to use [local, dist_sync, etc]') +parser.add_argument('--sparse-log-level', type=str, default='DEBUG', + help='logging level [DEBUG, INFO, ERROR]') +parser.add_argument('--dataset', type=str, default='avazu', + help='what test dataset to use') +parser.add_argument('--num-gpu', type=int, default=0, + help='number of gpus to use. 
0 means using cpu(0);' + 'otherwise, use gpu(0),...,gpu(num_gpu-1)') +parser.add_argument('--output-dim', type=int, default=4, + help='number of columns of the forward output') +parser.add_argument('--dummy-metric', type=int, default=0, + help='whether to call update_metric') +parser.add_argument('--enable-logging-for', default="0", + help="Enable logging for the specified list of workers") +parser.add_argument('--measure-only', default=None, + help="Measure only", + choices=[IO, COMP, COMM]) +parser.add_argument('--omit-row-sparse-push', action='store_true', + help="omit row_sparse_push") + +class DummyIter(mx.io.DataIter): + "A dummy iterator that always return the same batch, used for speed testing" + def __init__(self, real_iter): + super(DummyIter, self).__init__() + self.real_iter = real_iter + self.provide_data = real_iter.provide_data + self.provide_label = real_iter.provide_label + self.batch_size = real_iter.batch_size + + for batch in real_iter: + self.the_batch = batch + break + + def __iter__(self): + return self + + def next(self): + return self.the_batch + +# testing dataset sources +avazu = { + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000001, + 'lc': 1719304, +} + +kdda = { + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216831, + 'lc': 510302, +} + +criteo = { + 'data_name': 'criteo.t', + 'data_origin_name': 'criteo.t.bz2', + 'url': "https://s3-us-west-2.amazonaws.com/sparse-dataset/criteo.t.bz2", + 'feature_dim': 8388621, + 'lc': 548787, +} + +datasets = { 'kdda' : kdda, 'avazu' : avazu , 'criteo': criteo } + + +def get_sym(feature_dim): + inputs = mx.symbol.Variable("data", stype='csr') + norm_init = mx.initializer.Normal(sigma=0.01) + weights = mx.symbol.Variable("w", shape=(feature_dim, 
args.output_dim), + init=norm_init, stype='row_sparse') + embed = mx.symbol.sparse.dot(inputs, weights) + softmax_output = mx.symbol.Variable("softmax_label") + model = mx.symbol.SoftmaxOutput(data=embed, label=softmax_output, name="out") + return model + + +def row_sparse_push(kv, param_arrays, grad_arrays, param_names): + for index, pair in enumerate(zip(param_arrays, grad_arrays)): + arg_list, grad_list = pair + if grad_list[0] is None: + continue + name = param_names[index] + kv.push(name, grad_list, priority=-index) + + +def row_sparse_pull(kv, key, data, slices, weight_array, priority): + # if have kvstore, need to pull corresponding rows of + # the weights to each context + # column indices (NDArray type) of the csr data + # used as the row_idx of the weight row-sparse matrix + row_indices = data.indices + if len(slices) == 1: + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_indices) + else: # more than one slices, multi-GPU training. Need to retain weight rows according to data slices + # TODO(junwu): + # the following line blocks, may need to pre-compute + # and cache it outside the for loop + indptr = data.indptr.asnumpy() + row_idx_array = [] + for s in slices: + row_idx_array.append(row_indices[indptr[s.start]:indptr[s.stop]]) + kv.row_sparse_pull(key, weight_array, priority=priority, row_ids=row_idx_array) + + +if __name__ == '__main__': + + # arg parser + args = parser.parse_args() + num_epoch = args.num_epoch + num_batch = args.num_batch + kvstore = args.kvstore + profiler = args.profiler > 0 + batch_size = args.batch_size if args.num_gpu == 0 else args.num_gpu * args.batch_size + dummy_iter = args.dummy_iter + dataset = args.dataset + log_level = args.sparse_log_level + measure_only = args.measure_only + num_cores = multiprocessing.cpu_count() + omit_row_sparse_push = args.omit_row_sparse_push + if measure_only == COMP or measure_only == IO: + assert not kvstore, "when compute_only or io_only is set, kvstore should be None" + 
num_batch = datasets[dataset]['lc'] / batch_size if num_batch == MAX_NUM_BATCH else num_batch + if measure_only == COMM: + assert (kvstore == "dist_async"), "when communication_only is set kvstore should be dist_async" + num_batch = datasets[dataset]['lc'] / batch_size if num_batch == MAX_NUM_BATCH else num_batch + + + contexts = mx.context.cpu(0) if args.num_gpu < 1\ + else [mx.context.gpu(i) for i in range(args.num_gpu)] + + # create kvstore when there are gpus + kv = mx.kvstore.create(kvstore) if kvstore else None + rank = kv.rank if kv is not None else 0 + num_worker = kv.num_workers if kv is not None else 1 + + # only print log for rank 0 worker + import logging + if log_level == 'ERROR': + log_level = logging.ERROR + elif log_level == 'DEBUG': + log_level = logging.DEBUG + else: + log_level = logging.INFO + + # Only log if it is in the list of workers to be logged + logging_workers_list = [int(i) for i in args.enable_logging_for.split(",")] + log_level = log_level if rank in logging_workers_list else logging.CRITICAL + + head = '%(asctime)-15s %(message)s' + logging.basicConfig(level=log_level, format=head) + + # dataset + assert(dataset in datasets), "unknown dataset " + dataset + metadata = datasets[dataset] + feature_dim = metadata['feature_dim'] + if logging: + logging.debug('preparing data ... 
') + data_dir = os.path.join(os.getcwd(), 'data') + path = os.path.join(data_dir, metadata['data_name']) + if not os.path.exists(path): + get_bz2_data(data_dir, metadata['data_name'], metadata['url'], + metadata['data_origin_name']) + assert os.path.exists(path) + + # data iterator + train_data = mx.io.LibSVMIter(data_libsvm=path, data_shape=(feature_dim,), + batch_size=batch_size, num_parts=num_worker, + part_index=rank) + if dummy_iter or measure_only == COMP or measure_only == COMM: + train_data = DummyIter(train_data) + + # model + model = get_sym(feature_dim) + + # module + mod = mx.mod.Module(symbol=model, data_names=['data'], + label_names=['softmax_label'], context=contexts) + mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label) + mod.init_params(initializer=mx.init.Uniform(scale=.1)) + sgd = mx.optimizer.SGD(momentum=0.0, clip_gradient=5.0, + learning_rate=0.1, rescale_grad=1.0/batch_size/num_worker) + mod.init_optimizer(optimizer=sgd, kvstore=kv) + # use accuracy as the metric + metric = mx.metric.create('acc') + + index = mod._exec_group.param_names.index('w') + # weight_array bound to executors of the contexts + weight_array = mod._exec_group.param_arrays[index] + + mx.nd.waitall() # sync point for initialization + # start profiler + if profiler: + device = 'cpu' + if args.num_gpu > 0: + device = 'gpu' + str(args.num_gpu) + name = 'profile_' + args.dataset + '_' + device + '_nworker' + str(num_worker)\ + + '_batchsize' + str(args.batch_size) + '_outdim' + str(args.output_dim) + '.json' + mx.profiler.profiler_set_config(mode='all', filename=name) + mx.profiler.profiler_set_state('run') + + logging.debug('start training ...') + start = time.time() + data_iter = iter(train_data) + time_cost_epoch = 0. + sum_cost_epoch = 0. + average_cost_epoch = 0. 
+ + for epoch in range(num_epoch): + start_time_epoch = time.time() + nbatch = 0 + end_of_batch = False + metric.reset() + next_batch = next(data_iter) + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + while not end_of_batch: + nbatch += 1 + batch = next_batch + + if measure_only != IO and measure_only != COMM: + mod.forward_backward(batch) + # update parameters + mod.update() + if measure_only == COMM: + if nbatch == 1: + mod.forward_backward(batch) + mod.update() + elif not omit_row_sparse_push: + row_sparse_push(kv, mod._exec_group.param_arrays, mod._exec_group.grad_arrays, mod._exec_group.param_names) + + + try: + # pre fetch next batch + next_batch = next(data_iter) + if nbatch == num_batch: + raise StopIteration + if kv is not None: + row_sparse_pull(kv, 'w', next_batch.data[0], mod._exec_group.slices, weight_array, -index) + except StopIteration: + end_of_batch = True + # accumulate prediction accuracy + if args.dummy_metric == 0: + mod.update_metric(metric, batch.label) + else: # call waitall to replace update_metric as sync point + mx.nd.waitall() # sync point for the current minibatch + logging.info('epoch {}, {}'.format(epoch, metric.get())) + end_time_epoch = time.time() + if epoch == 0: + logging.debug("num_batches = {}".format(nbatch)) + logging.info('|device|num_worker|average_cost_epoch|rank|') + time_cost_epoch = end_time_epoch - start_time_epoch + if epoch > 0: + sum_cost_epoch = sum_cost_epoch + time_cost_epoch + average_cost_epoch = float(sum_cost_epoch) / epoch + logging.info('num_worker = {}, time cost per epoch = {}'.format(str(num_worker), str(time_cost_epoch))) + if args.num_gpu < 1: + logging.info('|cpu/{} cores| {} | {} | {} |'.format(str(num_cores), str(num_worker), str(average_cost_epoch), rank)) + data_iter.reset() + if profiler: + mx.profiler.profiler_set_state('stop') + end = time.time() + time_cost = end - start + logging.info('num_worker = {}, rank = {}, time cost = 
{}'.format(str(num_worker), str(rank), str(time_cost))) diff --git a/benchmark/python/sparse/sparse_op.py b/benchmark/python/sparse/sparse_op.py new file mode 100644 index 000000000000..ebe62af05da6 --- /dev/null +++ b/benchmark/python/sparse/sparse_op.py @@ -0,0 +1,245 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import ctypes + +from mxnet.test_utils import * +import scipy.sparse as sp +import os +import time +import argparse + +from mxnet.base import check_call, _LIB +from util import get_data, estimate_density + +parser = argparse.ArgumentParser(description="Benchmark sparse operators", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('--num-omp-threads', type=int, default=1, help='number of omp threads to set in MXNet') +args = parser.parse_args() + +# some data information +kdda = { + 'data_mini': 'kdda.t.mini', + 'data_name': 'kdda.t', + 'data_origin_name': 'kdda.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/kdda.t.bz2", + 'feature_dim': 20216830, + 'm': 200, + 'batch_size': [64] +} + +avazu = { + 'data_mini': 'avazu-app.t.mini', + 'data_name': 'avazu-app.t', + 'data_origin_name': 'avazu-app.t.bz2', + 'url': "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/avazu-app.t.bz2", + 'feature_dim': 1000000, + 'm': 500, + 'batch_size': [64, 128] +} + + +def measure_cost(repeat, f, *args, **kwargs): + # start bench + start = time.time() + results = [] + for i in range(repeat): + results.append(f(*args, **kwargs)) + for result in results: + result.wait_to_read() + end = time.time() + diff = end - start + return diff / repeat + + +def test_dot_real(data_dict): + def get_iter(path, data_shape, batch_size): + data_train = mx.io.LibSVMIter(data_libsvm=path, + data_shape=data_shape, + batch_size=batch_size) + data_iter = iter(data_train) + return data_iter + + data_dir = os.path.join(os.getcwd(), 'data') + + path = os.path.join(data_dir, data_dict['data_name']) + if not os.path.exists(path): + get_data( + data_dir, + data_dict['data_name'], + data_dict['url'], + data_dict['data_origin_name'] + ) + assert os.path.exists(path) + + k = data_dict['feature_dim'] + m = data_dict['m'] + density = estimate_density(path, data_dict['feature_dim']) + + mini_path = os.path.join(data_dir, data_dict['data_mini']) + 
if not os.path.exists(mini_path): + os.system("head -n 2000 %r > %r" % (path, mini_path)) + assert os.path.exists(mini_path) + + print "Running Benchmarking on %r data" % data_dict['data_mini'] + for batch_size in data_dict['batch_size']: # iterator through different batch size of choice + print "batch_size is %d" % batch_size + # model + data_shape = (k, ) + train_iter = get_iter(mini_path, data_shape, batch_size) + weight = mx.nd.random.uniform(low=0, high=1, shape=(k, m)) + + csr_data = [] + dns_data = [] + num_batch = 0 + for batch in train_iter: + data = train_iter.getdata() + csr_data.append(data) + dns_data.append(data.tostype('default')) + num_batch += 1 + bag_of_data = [csr_data, dns_data] + num_repeat = 5 + costs = [] + for d in bag_of_data: + weight.wait_to_read() + cost = 0. + count = 0 + for d_batch in d: + d_batch.wait_to_read() + cost += measure_cost(num_repeat, mx.nd.dot, d_batch, weight) + count += 1 + costs.append(cost/count) + t_sparse = costs[0] + t_dense = costs[1] + ratio = t_dense / t_sparse + print('density(%)\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse') + fmt = "%0.4f\t\t%d\t%d\t%d\t%0.2f\t\t\t%0.4f\t%0.6f" + print(fmt % (density * 100, batch_size, m, k, ratio, t_dense, t_sparse)) + + +def test_dot_synthetic(): + """benchmark mx.nd.dot(sparse_ndarray, dense_ndarray) with given density. + `t_sparse` is the time cost of dot(csr, dns), while `t_dense` is the time cost + of dot(dns, dns), with the same matrix except that it is in default storage type. 
+ """ + def measure_cost_forward_baseline(repeat, dot, lhs, rhs): + start = time.time() + for i in range(repeat): + dot(lhs, rhs) + end = time.time() + diff = end - start + return diff / repeat + + def measure_cost_backward_baseline(repeat, dot, transpose, lhs, rhs): + start = time.time() + for i in range(repeat): + dot(transpose(lhs), rhs) + end = time.time() + diff = end - start + return diff / repeat + + def bench_dot_forward(m, k, n, density, ctx, repeat): + set_default_context(ctx) + dns = mx.nd.random.uniform(shape=(k, n)).copyto(ctx) + data_shape = (m, k) + csr_data = rand_ndarray(data_shape, 'csr', density) + dns_data = csr_data.tostype('default') + rhs_dns_np = dns.asnumpy() + lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy()) # csr in scipy + lhs_dns_np = lhs_csr_sp.tostype('default') + + data = [dns_data, csr_data] + costs = [] + for d in data: + dns.wait_to_read() + d.wait_to_read() + cost = measure_cost(repeat, mx.nd.dot, d, dns) + costs.append(cost) + ratio = costs[0] / costs[1] + + costs_baseline = [] + cost = measure_cost_forward_baseline(repeat, np.dot, lhs_dns_np, rhs_dns_np) + costs_baseline.append(cost) + cost = measure_cost_forward_baseline(repeat, sp.spmatrix.dot, lhs_csr_sp, rhs_dns_np) + costs_baseline.append(cost) + ratio_baseline = costs_baseline[0] / costs_baseline[1] + fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f" + print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1], + ratio_baseline, costs_baseline[0], costs_baseline[1])) + + def bench_dot_backward(m, k, n, density, ctx, repeat): + set_default_context(ctx) + dns = mx.nd.random.uniform(shape=(m, n)).copyto(ctx) + data_shape = (m, k) + csr_data = rand_ndarray(data_shape, 'csr', density) + dns_data = csr_data.tostype('default') + rhs_dns_np = dns.asnumpy() + lhs_csr_sp = sp.csr_matrix(dns_data.asnumpy()) + lhs_dns_np = lhs_csr_sp.tostype('default') + + data = [dns_data, csr_data] + costs = [] + for d in data: + dns.wait_to_read() + 
d.wait_to_read() + cost = measure_cost(repeat, mx.nd.dot, d, dns, transpose_a=True) + costs.append(cost) + ratio = costs[0] / costs[1] + + costs_baseline = [] + cost = measure_cost_backward_baseline(repeat, np.dot, np.transpose, lhs_dns_np, rhs_dns_np) + costs_baseline.append(cost) + cost = measure_cost_backward_baseline(repeat, sp.spmatrix.dot, sp.spmatrix.transpose, lhs_csr_sp, rhs_dns_np) + costs_baseline.append(cost) + ratio_baseline = costs_baseline[0] / costs_baseline[1] + fmt = "%0.1f\t\t%s\t%d\t%d\t%d\t%0.2f\t\t\t%0.2f\t%0.5f\t\t%0.2f\t\t\t\t%0.6f\t%0.5f" + print(fmt % (density * 100, str(ctx), n, m, k, ratio, costs[0], costs[1], + ratio_baseline, costs_baseline[0], costs_baseline[1])) + + print("A = sparse NDArray of shape(m, k)") + print("B = dense NDArray of shape(k, n)") + print("dot_forward\tdot(csr, dns)") + print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse' + '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse') + + check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads))) + # TODO(haibin) make these runtime options + m = 512 + k = [50000, 100000] + n = [64, 128] + density = [1.00, 0.90, 0.70, 0.50, 0.30, 0.20, 0.10, 0.07, 0.05, 0.02, 0.01, 0.005, 0.001] + num_repeat = 10 + # contexts = [mx.cpu(), mx.gpu(0)] + contexts = [mx.cpu()] + for i in range(2): + for ctx in contexts: + for den in density: + bench_dot_forward(m, k[i], n[i], den, ctx, num_repeat) + + print("dot_backward\tdot(csr.T, dns)") + print('density(%)\tcontext\tn\tm\tk\tt_dense/t_sparse\tt_dense\tt_sparse' + '\tt_scipy_dense/t_scipy_sparse\tt_scipy_dense\tt_scipy_sparse') + for i in range(2): + for ctx in contexts: + for den in density: + bench_dot_backward(m, k[i], n[i], den, ctx, num_repeat) + + +if __name__ == "__main__": + test_dot_real(avazu) + test_dot_real(kdda) + test_dot_synthetic() diff --git a/benchmark/python/sparse/util.py b/benchmark/python/sparse/util.py new file mode 100644 index 000000000000..c20b33a86d65 --- /dev/null +++ 
b/benchmark/python/sparse/util.py @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import random + +def estimate_density(DATA_PATH, feature_size): + """sample 10 times of a size of 1000 for estimating the density of the sparse dataset""" + if not os.path.exists(DATA_PATH): + raise Exception("Data is not there!") + density = [] + P = 0.01 + for _ in xrange(10): + num_non_zero = 0 + num_sample = 0 + with open(DATA_PATH) as f: + for line in f: + if (random.random() < P): + num_non_zero += len(line.split(" ")) - 1 + num_sample += 1 + density.append(num_non_zero * 1.0 / (feature_size * num_sample)) + return sum(density) / len(density) + diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake new file mode 100644 index 000000000000..3a8723a5dd5e --- /dev/null +++ b/cmake/ChooseBlas.cmake @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(BLAS "Open" CACHE STRING "Selected BLAS library") +set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") + +if(USE_MKL_IF_AVAILABLE) + if(NOT MKL_FOUND) + find_package(MKL) + endif() + if(MKL_FOUND) + if(USE_MKLML_MKL) + set(BLAS "open") + else() + set(BLAS "MKL") + endif() + endif() +endif() + +if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") + find_package(Atlas REQUIRED) + include_directories(SYSTEM ${Atlas_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${Atlas_LIBRARIES}) + add_definitions(-DMSHADOW_USE_CBLAS=1) + add_definitions(-DMSHADOW_USE_MKL=0) +elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") + find_package(OpenBLAS REQUIRED) + include_directories(SYSTEM ${OpenBLAS_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${OpenBLAS_LIB}) + add_definitions(-DMSHADOW_USE_CBLAS=1) + add_definitions(-DMSHADOW_USE_MKL=0) +elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") + find_package(MKL REQUIRED) + include_directories(SYSTEM ${MKL_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${MKL_LIBRARIES}) + add_definitions(-DMSHADOW_USE_CBLAS=0) + add_definitions(-DMSHADOW_USE_MKL=1) +elseif(BLAS STREQUAL "apple") + find_package(Accelerate REQUIRED) + include_directories(SYSTEM ${Accelerate_INCLUDE_DIR}) + list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) + add_definitions(-DMSHADOW_USE_MKL=0) + add_definitions(-DMSHADOW_USE_CBLAS=1) +endif() \ No newline at end of file diff --git a/cmake/FirstClassLangCuda.cmake b/cmake/FirstClassLangCuda.cmake new file mode 100644 index 000000000000..73f075806243 --- /dev/null +++ 
b/cmake/FirstClassLangCuda.cmake @@ -0,0 +1,236 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +#this file is CUDA help function with CMAKE first class CUDA + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-std=c++11" SUPPORT_CXX11) + +################################################################################################ +# Short command for cuDNN detection. Believe it soon will be a part of CUDA toolkit distribution. +# That's why not FindcuDNN.cmake file, but just the macro +# Usage: +# detect_cuDNN() +function(detect_cuDNN) + set(CUDNN_ROOT "" CACHE PATH "CUDNN root folder") + + find_path(CUDNN_INCLUDE cudnn.h + PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} + DOC "Path to cuDNN include directory." 
) + + + find_library(CUDNN_LIBRARY NAMES libcudnn.so cudnn.lib # libcudnn_static.a + PATHS ${CUDNN_ROOT} $ENV{CUDNN_ROOT} ${CUDNN_INCLUDE} + DOC "Path to cuDNN library.") + + if(CUDNN_INCLUDE AND CUDNN_LIBRARY) + set(HAVE_CUDNN TRUE PARENT_SCOPE) + set(CUDNN_FOUND TRUE PARENT_SCOPE) + + mark_as_advanced(CUDNN_INCLUDE CUDNN_LIBRARY CUDNN_ROOT) + message(STATUS "Found cuDNN (include: ${CUDNN_INCLUDE}, library: ${CUDNN_LIBRARY})") + endif() +endfunction() + + + +################################################################################################ +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# mshadow_detect_installed_gpus(out_variable) +function(mshadow_detect_installed_gpus out_variable) + if(NOT CUDA_gpu_detect_output) + set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${__cufile} "" + "#include <cstdio>\n" + "int main()\n" + "{\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device)\n" + " {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + enable_language(CUDA) + + try_run(__nvcc_res __compile_result ${PROJECT_BINARY_DIR} ${__cufile} + COMPILE_OUTPUT_VARIABLE __compile_out + RUN_OUTPUT_VARIABLE __nvcc_out) + + if(__nvcc_res EQUAL 0 AND __compile_result) + # nvcc outputs text containing line breaks when building with MSVC. 
+ # The line below prevents CMake from inserting a variable with line + # breaks in the cache + string(REGEX MATCH "([1-9].[0-9])" __nvcc_out "${__nvcc_out}") + string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}") + set(CUDA_gpu_detect_output ${__nvcc_out}) + else() + message(WARNING "Running GPU detection script with nvcc failed: ${__nvcc_out} ${__compile_out}") + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(WARNING "Automatic GPU detection failed. Building for all known architectures (${mshadow_known_gpu_archs}).") + set(${out_variable} ${mshadow_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +# This list will be used for CUDA_ARCH_NAME = All option +set(CUDA_KNOWN_GPU_ARCHITECTURES "Fermi" "Kepler" "Maxwell") + +# This list will be used for CUDA_ARCH_NAME = Common option (enabled by default) +set(CUDA_COMMON_GPU_ARCHITECTURES "3.0" "3.5" "5.0") + +if (CUDA_TOOLSET VERSION_GREATER "6.5") + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2") +endif () + +if (CUDA_TOOLSET VERSION_GREATER "7.5") + list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Pascal") + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "6.0" "6.1" "6.1+PTX") +else() + list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2+PTX") +endif () + +################################################################################################ +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# mshadow_select_nvcc_arch_flags(out_variable) +function(mshadow_select_nvcc_arch_flags out_variable) + + set(CUDA_ARCH_LIST "All" CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_LIST PROPERTY STRINGS "" "All" ${CUDA_KNOWN_GPU_ARCHITECTURES} ) + mark_as_advanced(CUDA_ARCH_NAME) + + + if("X${CUDA_ARCH_LIST}" STREQUAL "X" ) + set(CUDA_ARCH_LIST "All") + endif() + + 
set(cuda_arch_bin) + set(cuda_arch_ptx) + + if("${CUDA_ARCH_LIST}" STREQUAL "All") + set(CUDA_ARCH_LIST ${CUDA_KNOWN_GPU_ARCHITECTURES}) + elseif("${CUDA_ARCH_LIST}" STREQUAL "Common") + set(CUDA_ARCH_LIST ${CUDA_COMMON_GPU_ARCHITECTURES}) + elseif("${CUDA_ARCH_LIST}" STREQUAL "Auto") + mshadow_detect_installed_gpus(CUDA_ARCH_LIST) + message(STATUS "Autodetected CUDA architecture(s): ${CUDA_ARCH_LIST}") + endif() + + # Now process the list and look for names + string(REGEX REPLACE "[ \t]+" ";" CUDA_ARCH_LIST "${CUDA_ARCH_LIST}") + list(REMOVE_DUPLICATES CUDA_ARCH_LIST) + foreach(arch_name ${CUDA_ARCH_LIST}) + set(arch_bin) + set(arch_ptx) + set(add_ptx FALSE) + # Check to see if we are compiling PTX + if(arch_name MATCHES "(.*)\\+PTX$") + set(add_ptx TRUE) + set(arch_name ${CMAKE_MATCH_1}) + endif() + if(arch_name MATCHES "^([0-9]\\.[0-9](\\([0-9]\\.[0-9]\\))?)$") + set(arch_bin ${CMAKE_MATCH_1}) + set(arch_ptx ${arch_bin}) + else() + # Look for it in our list of known architectures + if(${arch_name} STREQUAL "Fermi") + if (CUDA_TOOLSET VERSION_LESS "8.0") + set(arch_bin 2.0 "2.1(2.0)") + endif() + elseif(${arch_name} STREQUAL "Kepler+Tegra") + set(arch_bin 3.2) + elseif(${arch_name} STREQUAL "Kepler+Tesla") + set(arch_bin 3.7) + elseif(${arch_name} STREQUAL "Kepler") + set(arch_bin 3.0 3.5) + set(arch_ptx 3.5) + elseif(${arch_name} STREQUAL "Maxwell+Tegra") + set(arch_bin 5.3) + elseif(${arch_name} STREQUAL "Maxwell") + set(arch_bin 5.0 5.2) + set(arch_ptx 5.2) + elseif(${arch_name} STREQUAL "Pascal") + set(arch_bin 6.0 6.1) + set(arch_ptx 6.1) + else() + message(SEND_ERROR "Unknown CUDA Architecture Name ${arch_name} in CUDA_SELECT_NVCC_ARCH_FLAGS") + endif() + endif() + list(APPEND cuda_arch_bin ${arch_bin}) + if(add_ptx) + if (NOT arch_ptx) + set(arch_ptx ${arch_bin}) + endif() + list(APPEND cuda_arch_ptx ${arch_ptx}) + endif() + endforeach() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." 
"" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX REPLACE "\\." "" cuda_arch_ptx "${cuda_arch_ptx}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + + if(cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_bin) + endif() + if(cuda_arch_ptx) + list(REMOVE_DUPLICATES cuda_arch_ptx) + endif() + + message(STATUS "cuda arch bin: ${cuda_arch_bin}") + message(STATUS "cuda arch ptx: ${cuda_arch_ptx}") + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified ARCH for the concrete CODE + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify ARCH for the concrete CODE, we assume ARCH=CODE + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() + diff --git a/cmake/Modules/FindGperftools.cmake b/cmake/Modules/FindGperftools.cmake new file mode 100644 index 000000000000..180f4785d396 --- /dev/null +++ b/cmake/Modules/FindGperftools.cmake @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tries to find Gperftools. +# +# Usage of this module as follows: +# +# find_package(Gperftools) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Gperftools_ROOT_DIR Set this variable to the root installation of +# Gperftools if the module has problems finding +# the proper installation path. +# +# Variables defined by this module: +# +# GPERFTOOLS_FOUND System has Gperftools libs/headers +# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) +# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers + +find_library(GPERFTOOLS_TCMALLOC + NAMES tcmalloc + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_PROFILER + NAMES profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER + NAMES tcmalloc_and_profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_path(GPERFTOOLS_INCLUDE_DIR + NAMES gperftools/heap-profiler.h + HINTS ${Gperftools_ROOT_DIR}/include) + +set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Gperftools + DEFAULT_MSG + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +mark_as_advanced( + Gperftools_ROOT_DIR + GPERFTOOLS_TCMALLOC + GPERFTOOLS_PROFILER + GPERFTOOLS_TCMALLOC_AND_PROFILER + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + diff --git 
a/cmake/Modules/FindJeMalloc.cmake b/cmake/Modules/FindJeMalloc.cmake index 57f47448f0a0..0ab1cec55f1f 100644 --- a/cmake/Modules/FindJeMalloc.cmake +++ b/cmake/Modules/FindJeMalloc.cmake @@ -1,28 +1,27 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at +# Distributed under the Boost Software License, Version 1.0. +# Boost Software License - Version 1.0 - August 17th, 2003 # -# http://www.apache.org/licenses/LICENSE-2.0 +# Permission is hereby granted, free of charge, to any person or organization +# obtaining a copy of the software and accompanying documentation covered by +# this license (the "Software") to use, reproduce, display, distribute, +# execute, and transmit the Software, and to prepare derivative works of the +# Software, and to permit third-parties to whom the Software is furnished to +# do so, all subject to the following: # -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- - -# Copyright (c) 2014 Thomas Heller -# Copyright (c) 2007-2012 Hartmut Kaiser -# Copyright (c) 2010-2011 Matt Anderson -# Copyright (c) 2011 Bryce Lelbach +# The copyright notices in the Software and this entire statement, including +# the above license grant, this restriction and the following disclaimer, +# must be included in all copies of the Software, in whole or in part, and +# all derivative works of the Software, unless such copies or derivative +# works are solely in the form of machine-executable object code generated by +# a source language processor. # -# Distributed under the Boost Software License, Version 1.0. (See accompanying -# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +# SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +# FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. find_package(PkgConfig) pkg_check_modules(PC_JEMALLOC QUIET jemalloc) diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake index 7c5272b7f779..a3a79caae461 100644 --- a/cmake/Modules/FindOpenBLAS.cmake +++ b/cmake/Modules/FindOpenBLAS.cmake @@ -15,9 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-if(MKL_FOUND) - message(ERROR " OpenBLAS is not required since MKL is enabled") -endif() file(TO_CMAKE_PATH "$ENV{OpenBLAS_HOME}" OpenBLAS_HOME) file(TO_CMAKE_PATH "$ENV{OpenBLAS}" OpenBLAS_DIR) diff --git a/cpp-package/README.md b/cpp-package/README.md index dcfcbc81f3a7..cc656352c170 100644 --- a/cpp-package/README.md +++ b/cpp-package/README.md @@ -1,8 +1,12 @@ -# MxNet C++ Package +# MXNet C++ Package + -The examples dir containers examples for you to get started. -The lib dir should contain the compiled mxnet library. -Windows dir contains Visual C++ solution files and project files. +To build the package, please refer to . + +A basic tutorial can be found at . + +The example directory contains examples for you to get started. diff --git a/cpp-package/example/CMakeLists.txt b/cpp-package/example/CMakeLists.txt index 7083dfd014e9..b4cea68fbd05 100644 --- a/cpp-package/example/CMakeLists.txt +++ b/cpp-package/example/CMakeLists.txt @@ -17,11 +17,13 @@ file(GLOB_RECURSE CPP_PACKAGE_HEADERS "${CPP_PACKAGE_INCLUDE_DIR}/*.hpp" ) -add_custom_target( - cpp_package_deploy_library ALL - DEPENDS mxnet - COMMAND ${CMAKE_COMMAND} -E copy $ $ -) +if (MSVC) + add_custom_target( + cpp_package_deploy_library ALL + DEPENDS mxnet + COMMAND ${CMAKE_COMMAND} -E copy $ $ + ) +endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include) diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp index 4194b5bae905..dd5d2b4b06d6 100644 --- a/cpp-package/example/alexnet.cpp +++ b/cpp-package/example/alexnet.cpp @@ -23,8 +23,7 @@ #include #include #include "mxnet-cpp/MxNetCpp.h" -// Allow IDE to parse the types -#include "../include/mxnet-cpp/op.h" + using namespace std; using namespace mxnet::cpp; diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index f5fff853cbad..218d11efc9c8 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -43,8 +43,6 @@ #include #include "mxnet-cpp/MxNetCpp.h" -// Allow 
IDE to parse the types -#include "../include/mxnet-cpp/op.h" using namespace std; using namespace mxnet::cpp; diff --git a/cpp-package/example/feature_extract/README.md b/cpp-package/example/feature_extract/README.md new file mode 100644 index 000000000000..4367a0c2efe3 --- /dev/null +++ b/cpp-package/example/feature_extract/README.md @@ -0,0 +1,8 @@ +This example shows how to extract features with a pretrained model. + +You can first get a pretrained model from , +then prepare 2 pictures 1.jpg and 2.jpg to extract by executing `run.sh`. + +Note: +1. The filename of network parameters may vary, line 67 in `feature_extract.cpp` should be updated accordingly. +2. As the build system has changed a lot, to build this example, you need to put the compiled library `libmxnet.so` in `../lib/linux`. diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp index ac0585e81a70..fe5dea6a1f58 100644 --- a/cpp-package/example/googlenet.cpp +++ b/cpp-package/example/googlenet.cpp @@ -22,10 +22,8 @@ #include #include #include - #include "mxnet-cpp/MxNetCpp.h" -// Allow IDE to parse the types -#include "../include/mxnet-cpp/op.h" + using namespace mxnet::cpp; @@ -159,8 +157,8 @@ int main(int argc, char const *argv[]) { train_iter.Reset(); while (train_iter.Next()) { auto data_batch = train_iter.GetDataBatch(); - args_map["data"] = data_batch.data.Copy(Context::gpu()); - args_map["data_label"] = data_batch.label.Copy(Context::gpu()); + data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); exec->Forward(true); exec->Backward(); @@ -174,8 +172,8 @@ int main(int argc, char const *argv[]) { val_iter.Reset(); while (val_iter.Next()) { auto data_batch = val_iter.GetDataBatch(); - args_map["data"] = data_batch.data.Copy(Context::gpu()); - args_map["data_label"] = data_batch.label.Copy(Context::gpu()); + data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); 
NDArray::WaitAll(); exec->Forward(false); NDArray::WaitAll(); diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp index de21aadea9b5..e6f47904e0eb 100644 --- a/cpp-package/example/inception_bn.cpp +++ b/cpp-package/example/inception_bn.cpp @@ -19,13 +19,11 @@ /*! */ -#include #include #include #include #include "mxnet-cpp/MxNetCpp.h" -// Allow IDE to parse the types -#include "../include/mxnet-cpp/op.h" + using namespace mxnet::cpp; diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index 05cc4517fe1e..4c5a1f1165c1 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -19,14 +19,12 @@ /*! */ -#include #include #include #include #include #include "mxnet-cpp/MxNetCpp.h" -// Allow IDE to parse the types -#include "../include/mxnet-cpp/op.h" + using namespace std; using namespace mxnet::cpp; diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp index 077f55622561..04f5cbca3a9d 100644 --- a/cpp-package/example/lenet_with_mxdataiter.cpp +++ b/cpp-package/example/lenet_with_mxdataiter.cpp @@ -19,14 +19,12 @@ /*! 
*/ -#include -#include #include #include #include +#include #include "mxnet-cpp/MxNetCpp.h" -// Allow IDE to parse the types -#include "../include/mxnet-cpp/op.h" + using namespace std; using namespace mxnet::cpp; @@ -89,15 +87,15 @@ int main(int argc, char const *argv[]) { args_map["fc2_b"] = 0; auto train_iter = MXDataIter("MNISTIter") - .SetParam("image", "./train-images-idx3-ubyte") - .SetParam("label", "./train-labels-idx1-ubyte") + .SetParam("image", "./mnist_data/train-images-idx3-ubyte") + .SetParam("label", "./mnist_data/train-labels-idx1-ubyte") .SetParam("batch_size", batch_size) .SetParam("shuffle", 1) .SetParam("flat", 0) .CreateDataIter(); auto val_iter = MXDataIter("MNISTIter") - .SetParam("image", "./t10k-images-idx3-ubyte") - .SetParam("label", "./t10k-labels-idx1-ubyte") + .SetParam("image", "./mnist_data/t10k-images-idx3-ubyte") + .SetParam("label", "./mnist_data/t10k-labels-idx1-ubyte") .CreateDataIter(); Optimizer* opt = OptimizerRegistry::Find("ccsgd"); @@ -111,35 +109,62 @@ int main(int argc, char const *argv[]) { auto *exec = lenet.SimpleBind(Context::gpu(), args_map); auto arg_names = lenet.ListArguments(); + // Create metrics + Accuracy train_acc, val_acc; + for (int iter = 0; iter < max_epoch; ++iter) { - LG << "Epoch: " << iter; - train_iter.Reset(); - while (train_iter.Next()) { + int samples = 0; + train_iter.Reset(); + train_acc.Reset(); + + auto tic = chrono::system_clock::now(); + + while (train_iter.Next()) { + samples += batch_size; auto data_batch = train_iter.GetDataBatch(); - args_map["data"] = data_batch.data.Copy(Context::gpu()); - args_map["data_label"] = data_batch.label.Copy(Context::gpu()); + + data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); + + // Compute gradients exec->Forward(true); exec->Backward(); + // Update parameters for (size_t i = 0; i < arg_names.size(); ++i) { if (arg_names[i] == "data" || arg_names[i] == "data_label") continue; 
opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]); } + + // Update metric + train_acc.Update(data_batch.label, exec->outputs[0]); } + // one epoch of training is finished + auto toc = chrono::system_clock::now(); + float duration = chrono::duration_cast(toc - tic).count() / 1000.0; + LG << "Epoch[" << iter << "] " << samples / duration \ + << " samples/sec " << "Train-Accuracy=" << train_acc.Get();; + + val_iter.Reset(); + val_acc.Reset(); + Accuracy acu; val_iter.Reset(); while (val_iter.Next()) { auto data_batch = val_iter.GetDataBatch(); - args_map["data"] = data_batch.data.Copy(Context::gpu()); - args_map["data_label"] = data_batch.label.Copy(Context::gpu()); + data_batch.data.CopyTo(&args_map["data"]); + data_batch.label.CopyTo(&args_map["data_label"]); NDArray::WaitAll(); + + // Only forward pass is enough as no gradient is needed when evaluating exec->Forward(false); NDArray::WaitAll(); acu.Update(data_batch.label, exec->outputs[0]); + val_acc.Update(data_batch.label, exec->outputs[0]); } - LG << "Accuracy: " << acu.Get(); + LG << "Epoch[" << iter << "] Val-Accuracy=" << val_acc.Get(); } delete exec; diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp index c9c4ff245180..b40328da6e9a 100644 --- a/cpp-package/example/mlp.cpp +++ b/cpp-package/example/mlp.cpp @@ -24,8 +24,7 @@ #include #include #include "mxnet-cpp/MxNetCpp.h" -// Allow IDE to parse the types -#include "../include/mxnet-cpp/op.h" + using namespace std; using namespace mxnet::cpp; diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp index 748c32e8c274..051bad1bd26a 100644 --- a/cpp-package/example/mlp_cpu.cpp +++ b/cpp-package/example/mlp_cpu.cpp @@ -106,8 +106,8 @@ int main(int argc, char** argv) { samples += batch_size; auto data_batch = train_iter.GetDataBatch(); // Set data and label - args["X"] = data_batch.data; - args["label"] = data_batch.label; + data_batch.data.CopyTo(&args["X"]); + data_batch.label.CopyTo(&args["label"]); // Compute 
gradients exec->Forward(true); @@ -124,8 +124,8 @@ int main(int argc, char** argv) { val_iter.Reset(); while (val_iter.Next()) { auto data_batch = val_iter.GetDataBatch(); - args["X"] = data_batch.data; - args["label"] = data_batch.label; + data_batch.data.CopyTo(&args["X"]); + data_batch.label.CopyTo(&args["label"]); // Forward pass is enough as no gradient is needed when evaluating exec->Forward(false); acc.Update(data_batch.label, exec->outputs[0]); diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp index ca5643de9d81..03b3d7217648 100644 --- a/cpp-package/example/resnet.cpp +++ b/cpp-package/example/resnet.cpp @@ -19,13 +19,11 @@ /*! */ -#include #include #include #include #include "mxnet-cpp/MxNetCpp.h" -// Allow IDE to parse the types -#include "../include/mxnet-cpp/op.h" + using namespace mxnet::cpp; diff --git a/cpp-package/include/mxnet-cpp/MxNetCpp.h b/cpp-package/include/mxnet-cpp/MxNetCpp.h index 882bbead51e5..7ac039dd8816 100644 --- a/cpp-package/include/mxnet-cpp/MxNetCpp.h +++ b/cpp-package/include/mxnet-cpp/MxNetCpp.h @@ -18,6 +18,7 @@ */ /*! + * Copyright (c) 2016 by Contributors * \file MxNetCpp.h * \brief meta include file for mxnet.cpp * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/base.h b/cpp-package/include/mxnet-cpp/base.h index 19375c0f81e8..d0f1bea15f00 100644 --- a/cpp-package/include/mxnet-cpp/base.h +++ b/cpp-package/include/mxnet-cpp/base.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file base.h * \brief base definitions for mxnetcpp * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/executor.h b/cpp-package/include/mxnet-cpp/executor.h index 7e45ef56ab95..4cb28819de02 100644 --- a/cpp-package/include/mxnet-cpp/executor.h +++ b/cpp-package/include/mxnet-cpp/executor.h @@ -18,6 +18,7 @@ */ /*! 
+* Copyright (c) 2016 by Contributors * \file executor.h * \brief executor definition * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/initializer.h b/cpp-package/include/mxnet-cpp/initializer.h index e5bfa4da8eed..61e95469b76d 100644 --- a/cpp-package/include/mxnet-cpp/initializer.h +++ b/cpp-package/include/mxnet-cpp/initializer.h @@ -18,6 +18,7 @@ */ /*! + * Copyright (c) 2016 by Contributors * \file initializer.h * \brief random initializer * \author Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/io.h b/cpp-package/include/mxnet-cpp/io.h index 7281416ae36a..7099d7d46fee 100644 --- a/cpp-package/include/mxnet-cpp/io.h +++ b/cpp-package/include/mxnet-cpp/io.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file operator.h * \brief definition of io, such as DataIter * \author Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/kvstore.h b/cpp-package/include/mxnet-cpp/kvstore.h index 9c3c81f37ff7..d5aa1509a8f0 100644 --- a/cpp-package/include/mxnet-cpp/kvstore.h +++ b/cpp-package/include/mxnet-cpp/kvstore.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file kvstore.h * \brief definition of kvstore * \author Chuntao Hong diff --git a/cpp-package/include/mxnet-cpp/lr_scheduler.h b/cpp-package/include/mxnet-cpp/lr_scheduler.h index b9381a830a88..cffd1c7576e5 100644 --- a/cpp-package/include/mxnet-cpp/lr_scheduler.h +++ b/cpp-package/include/mxnet-cpp/lr_scheduler.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2017 by Contributors * \file lr_scheduler.h * \brief Scheduling learning rate */ diff --git a/cpp-package/include/mxnet-cpp/metric.h b/cpp-package/include/mxnet-cpp/metric.h index 6dbb197dae49..d015d8b4acc9 100644 --- a/cpp-package/include/mxnet-cpp/metric.h +++ b/cpp-package/include/mxnet-cpp/metric.h @@ -18,6 +18,7 @@ */ /*! 
+* Copyright (c) 2016 by Contributors * \file base.h * \brief metrics defined * \author Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/model.h b/cpp-package/include/mxnet-cpp/model.h index c8af6a476a52..b3a0a9dbef6e 100644 --- a/cpp-package/include/mxnet-cpp/model.h +++ b/cpp-package/include/mxnet-cpp/model.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file model.h * \brief MXNET.cpp model module * \author Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/monitor.h b/cpp-package/include/mxnet-cpp/monitor.h index 33ef4855c1a9..c1494d0bd0a6 100644 --- a/cpp-package/include/mxnet-cpp/monitor.h +++ b/cpp-package/include/mxnet-cpp/monitor.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2017 by Contributors * \file monitor.h * \brief monitor definition * \author Xin Li diff --git a/cpp-package/include/mxnet-cpp/ndarray.h b/cpp-package/include/mxnet-cpp/ndarray.h index 9e196d0730a8..082c06981cf9 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.h +++ b/cpp-package/include/mxnet-cpp/ndarray.h @@ -18,6 +18,7 @@ */ /*! 
+* Copyright (c) 2016 by Contributors * \file ndarray.h * \brief definition of ndarray * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/ndarray.hpp b/cpp-package/include/mxnet-cpp/ndarray.hpp index 5ed04a547b85..3c3b85d37326 100644 --- a/cpp-package/include/mxnet-cpp/ndarray.hpp +++ b/cpp-package/include/mxnet-cpp/ndarray.hpp @@ -33,6 +33,7 @@ #include #include "dmlc/logging.h" #include "mxnet-cpp/ndarray.h" +#include "mxnet-cpp/operator.h" namespace mxnet { namespace cpp { @@ -239,10 +240,10 @@ inline void NDArray::WaitToWrite() { } inline void NDArray::WaitAll() { CHECK_EQ(MXNDArrayWaitAll(), 0); } inline void NDArray::SampleGaussian(mx_float mu, mx_float sigma, NDArray *out) { - Operator("_sample_normal")(mu, sigma).Invoke(*out); + Operator("_random_normal")(mu, sigma).Invoke(*out); } inline void NDArray::SampleUniform(mx_float begin, mx_float end, NDArray *out) { - Operator("_sample_uniform")(begin, end).Invoke(*out); + Operator("_random_uniform")(begin, end).Invoke(*out); } inline void NDArray::Load(const std::string &file_name, std::vector *array_list, @@ -359,7 +360,6 @@ inline int NDArray::GetDType() const { inline const mx_float *NDArray::GetData() const { void *ret; - CHECK_NE(GetContext().GetDeviceType(), DeviceType::kGPU); MXNDArrayGetData(blob_ptr_->handle_, &ret); if (GetDType() != 0) { return NULL; diff --git a/cpp-package/include/mxnet-cpp/op_map.h b/cpp-package/include/mxnet-cpp/op_map.h index b54cc0ae2c01..17746d1fa596 100644 --- a/cpp-package/include/mxnet-cpp/op_map.h +++ b/cpp-package/include/mxnet-cpp/op_map.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file op_map.h * \brief definition of OpMap * \author Chuntao Hong diff --git a/cpp-package/include/mxnet-cpp/op_suppl.h b/cpp-package/include/mxnet-cpp/op_suppl.h index 52cdae772a68..4f3011c17caa 100644 --- a/cpp-package/include/mxnet-cpp/op_suppl.h +++ b/cpp-package/include/mxnet-cpp/op_suppl.h @@ -18,6 +18,7 @@ */ /*! 
+* Copyright (c) 2016 by Contributors * \file op_suppl.h * \brief A supplement and amendment of the operators from op.h * \author Zhang Chen, zhubuntu, Xin Li diff --git a/cpp-package/include/mxnet-cpp/op_util.h b/cpp-package/include/mxnet-cpp/op_util.h index 20e06a851814..b2b442fd8a88 100644 --- a/cpp-package/include/mxnet-cpp/op_util.h +++ b/cpp-package/include/mxnet-cpp/op_util.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2017 by Contributors * \file op_util.h * \brief operator helper functions * \author Chris Olivier diff --git a/cpp-package/include/mxnet-cpp/operator.h b/cpp-package/include/mxnet-cpp/operator.h index 02bd21ebe8c9..4d4bedac8fec 100644 --- a/cpp-package/include/mxnet-cpp/operator.h +++ b/cpp-package/include/mxnet-cpp/operator.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file operator.h * \brief definition of operator * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/optimizer.h b/cpp-package/include/mxnet-cpp/optimizer.h index e57da5d95ceb..4aebb55c50d1 100644 --- a/cpp-package/include/mxnet-cpp/optimizer.h +++ b/cpp-package/include/mxnet-cpp/optimizer.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file optimizer.h * \brief definition of optimizer * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/shape.h b/cpp-package/include/mxnet-cpp/shape.h index 2793e436c072..01ee47636351 100644 --- a/cpp-package/include/mxnet-cpp/shape.h +++ b/cpp-package/include/mxnet-cpp/shape.h @@ -18,6 +18,7 @@ */ /*! +* Copyright (c) 2016 by Contributors * \file shape.h * \brief definition of shape * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/symbol.h b/cpp-package/include/mxnet-cpp/symbol.h index 888aebd6f3ad..127ef156eb62 100644 --- a/cpp-package/include/mxnet-cpp/symbol.h +++ b/cpp-package/include/mxnet-cpp/symbol.h @@ -18,6 +18,7 @@ */ /*! 
+* Copyright (c) 2016 by Contributors * \file symbol.h * \brief definition of symbol * \author Chuntao Hong, Zhang Chen diff --git a/cpp-package/include/mxnet-cpp/symbol.hpp b/cpp-package/include/mxnet-cpp/symbol.hpp index ee1a11e26a40..11590fad6041 100644 --- a/cpp-package/include/mxnet-cpp/symbol.hpp +++ b/cpp-package/include/mxnet-cpp/symbol.hpp @@ -103,6 +103,7 @@ inline Symbol Symbol::Load(const std::string &file_name) { return Symbol(handle); } inline Symbol Symbol::LoadJSON(const std::string &json_str) { + op_map(); SymbolHandle handle; CHECK_EQ(MXSymbolCreateFromJSON(json_str.c_str(), &(handle)), 0); return Symbol(handle); diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py index 83495febcc63..ac957730d689 100644 --- a/cpp-package/scripts/OpWrapperGenerator.py +++ b/cpp-package/scripts/OpWrapperGenerator.py @@ -124,12 +124,15 @@ def __init__(self, opName = '', argName = '', typeString = '', descString = ''): self.defaultString = self.enum.GetDefaultValueString(self.defaultString) elif self.defaultString == 'None': self.defaultString = self.type + '()' - elif self.defaultString == 'False': - self.defaultString = 'false' - elif self.defaultString == 'True': - self.defaultString = 'true' + elif self.type == "bool": + if self.defaultString == "1" or self.defaultString == "True": + self.defaultString = "true" + else: + self.defaultString = "false" elif self.defaultString[0] == '(': self.defaultString = 'Shape' + self.defaultString + elif self.defaultString[0] == '[': + self.defaultString = 'Shape(' + self.defaultString[1:-1] + ")" elif self.type == 'dmlc::optional': self.defaultString = self.type + '(' + self.defaultString + ')' elif typeString.startswith('caffe-layer-parameter'): diff --git a/dmlc-core b/dmlc-core index 71bfbd3a9460..87b7ffa59eb7 160000 --- a/dmlc-core +++ b/dmlc-core @@ -1 +1 @@ -Subproject commit 71bfbd3a946075cea66ca9e19bad86dd33c19b46 +Subproject commit 87b7ffa59eb78f753073ac56f5f60e46d930b93c 
diff --git a/docker/install/perl.sh b/docker/install/perl.sh index a981746bc18d..af49952f97d6 100755 --- a/docker/install/perl.sh +++ b/docker/install/perl.sh @@ -19,4 +19,4 @@ # install libraries for mxnet's perl package on ubuntu apt-get update && apt-get install -y libmouse-perl pdl cpanminus swig libgraphviz-perl -cpanm -q Function::Parameters +cpanm -q Function::Parameters Hash::Ordered diff --git a/docker/install/scala.sh b/docker/install/scala.sh index bb0bb9c900d4..b1bfe28074f0 100755 --- a/docker/install/scala.sh +++ b/docker/install/scala.sh @@ -19,7 +19,15 @@ # install libraries for mxnet's scala package on ubuntu -apt-get install -y maven default-jdk + +apt-get install -y software-properties-common +add-apt-repository -y ppa:webupd8team/java +apt-get update +echo "oracle-java8-installer shared/accepted-oracle-license-v1-1 select true" | debconf-set-selections +apt-get install -y oracle-java8-installer +apt-get install -y oracle-java8-set-default + +apt-get install -y maven wget http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb dpkg -i scala-2.11.8.deb diff --git a/docker/run.sh b/docker/run.sh old mode 100644 new mode 100755 diff --git a/docker_multiarch/.gitignore b/docker_multiarch/.gitignore new file mode 100644 index 000000000000..2a07fbf7c51e --- /dev/null +++ b/docker_multiarch/.gitignore @@ -0,0 +1,2 @@ +mxnet/ +build/ diff --git a/docker_multiarch/Dockerfile.build.android.arm64 b/docker_multiarch/Dockerfile.build.android.arm64 new file mode 100644 index 000000000000..995e718bfb5f --- /dev/null +++ b/docker_multiarch/Dockerfile.build.android.arm64 @@ -0,0 +1,77 @@ +# -*- mode: dockerfile -*- +FROM dockcross/base:latest +MAINTAINER Pedro Larroy "pllarroy@amazon.com" + +# The cross-compiling emulator +RUN apt-get update && apt-get install -y \ + qemu-user \ + qemu-user-static \ + unzip + +ENV CROSS_TRIPLE=aarch64-linux-android +ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} +ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ + 
AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ + CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ + CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ + CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ + LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld + +ENV ANDROID_NDK_REVISION 15c +RUN mkdir -p /build && \ + cd /build && \ + curl -O https://dl.google.com/android/repository/android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ + unzip ./android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ + cd android-ndk-r${ANDROID_NDK_REVISION} && \ + ./build/tools/make_standalone_toolchain.py \ + --stl=libc++ \ + --arch arm64 \ + --api 21 \ + --install-dir=${CROSS_ROOT} && \ + cd / && \ + rm -rf /build && \ + find ${CROSS_ROOT} -exec chmod a+r '{}' \; && \ + find ${CROSS_ROOT} -executable -exec chmod a+x '{}' \; + + +ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm + +# COPY Toolchain.cmake ${CROSS_ROOT}/ +# ENV CMAKE_TOOLCHAIN_FILE ${CROSS_ROOT}/Toolchain.cmake + +# Build-time metadata as defined at http://label-schema.org +ARG BUILD_DATE +ARG IMAGE +ARG VCS_REF +ARG VCS_URL +LABEL org.label-schema.build-date=$BUILD_DATE \ + org.label-schema.name=$IMAGE \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url=$VCS_URL \ + org.label-schema.schema-version="1.0" + +ENV ARCH aarch64 + +# Build OpenBLAS +# https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android +RUN git clone https://github.com/xianyi/OpenBLAS.git && \ + cd OpenBLAS && \ + make -j$(nproc) TARGET=ARMV8 ARM_SOFTFP_ABI=1 HOSTCC=gcc NOFORTRAN=1 libs + +ENV OPENBLAS_ROOT /work/OpenBLAS +ENV LIBRARY_PATH /work/OpenBLAS/lib/:/work/OpenBLAS/:$LIBRARY_PATH +ENV CPLUS_INCLUDE_PATH /work/OpenBLAS/include/:/work/OpenBLAS/:$CPLUS_INCLUDE_PATH +WORKDIR /work + +ENV CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang +ENV CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-clang++ +ENV BUILD_OPTS "USE_BLAS=openblas USE_SSE=0 DMLC_LOG_STACK_TRACE=0 USE_OPENCV=0 USE_LAPACK=0" + +# Build MXNet +ADD mxnet mxnet +ADD arm.crosscompile.android.mk 
/work/mxnet/make/config.mk +RUN cd mxnet && \ + make -j$(nproc) $BUILD_OPTS + +WORKDIR /work/build/ +RUN cp /work/mxnet/lib/* . diff --git a/docker_multiarch/Dockerfile.build.android.armv7 b/docker_multiarch/Dockerfile.build.android.armv7 new file mode 100644 index 000000000000..12d53a412223 --- /dev/null +++ b/docker_multiarch/Dockerfile.build.android.armv7 @@ -0,0 +1,78 @@ +# -*- mode: dockerfile -*- +FROM dockcross/base:latest +MAINTAINER Pedro Larroy "pllarroy@amazon.com" + +# The cross-compiling emulator +RUN apt-get update && apt-get install -y \ + qemu-user \ + qemu-user-static \ + unzip + +ENV CROSS_TRIPLE=arm-linux-androideabi +ENV CROSS_ROOT=/usr/${CROSS_TRIPLE} +ENV AS=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-as \ + AR=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ar \ + CC=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-gcc \ + CPP=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-cpp \ + CXX=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-g++ \ + LD=${CROSS_ROOT}/bin/${CROSS_TRIPLE}-ld + +ENV ANDROID_NDK_REVISION 15c +RUN mkdir -p /build && \ + cd /build && \ + curl -O https://dl.google.com/android/repository/android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ + unzip ./android-ndk-r${ANDROID_NDK_REVISION}-linux-x86_64.zip && \ + cd android-ndk-r${ANDROID_NDK_REVISION} && \ + ./build/tools/make_standalone_toolchain.py \ + --stl=libc++ \ + --arch arm \ + --api 16 \ + --install-dir=${CROSS_ROOT} && \ + cd / && \ + rm -rf /build && \ + find ${CROSS_ROOT} -exec chmod a+r '{}' \; && \ + find ${CROSS_ROOT} -executable -exec chmod a+x '{}' \; + + +ENV DEFAULT_DOCKCROSS_IMAGE dockcross/android-arm + +# COPY Toolchain.cmake ${CROSS_ROOT}/ +# ENV CMAKE_TOOLCHAIN_FILE ${CROSS_ROOT}/Toolchain.cmake + +# Build-time metadata as defined at http://label-schema.org +ARG BUILD_DATE +ARG IMAGE +ARG VCS_REF +ARG VCS_URL +LABEL org.label-schema.build-date=$BUILD_DATE \ + org.label-schema.name=$IMAGE \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url=$VCS_URL \ + org.label-schema.schema-version="1.0" + +ENV CC 
/usr/arm-linux-androideabi/bin/arm-linux-androideabi-gcc +ENV CXX /usr/arm-linux-androideabi/bin/arm-linux-androideabi-g++ + +# Build OpenBLAS +# https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android +RUN git clone https://github.com/xianyi/OpenBLAS.git && \ + cd OpenBLAS && \ + make -j$(nproc) TARGET=ARMV7 ARM_SOFTFP_ABI=1 HOSTCC=gcc NOFORTRAN=1 libs + +ENV OPENBLAS_ROOT /work/OpenBLAS +ENV LIBRARY_PATH /work/OpenBLAS/lib/:/work/OpenBLAS/:$LIBRARY_PATH +ENV CPLUS_INCLUDE_PATH /work/OpenBLAS/include/:/work/OpenBLAS/:$CPLUS_INCLUDE_PATH +WORKDIR /work + +ENV CC /usr/arm-linux-androideabi/bin/arm-linux-androideabi-clang +ENV CXX /usr/arm-linux-androideabi/bin/arm-linux-androideabi-clang++ +ENV BUILD_OPTS "USE_BLAS=openblas USE_SSE=0 DMLC_LOG_STACK_TRACE=0 USE_OPENCV=0 USE_LAPACK=0" + +# Build MXNet +ADD mxnet mxnet +ADD arm.crosscompile.android.mk /work/mxnet/make/config.mk +RUN cd mxnet && \ + make -j$(nproc) $BUILD_OPTS + +WORKDIR /work/build/ +RUN cp /work/mxnet/lib/* . 
diff --git a/docker_multiarch/Dockerfile.build.arm64 b/docker_multiarch/Dockerfile.build.arm64 new file mode 100644 index 000000000000..33f7b3f45e9b --- /dev/null +++ b/docker_multiarch/Dockerfile.build.arm64 @@ -0,0 +1,37 @@ +# -*- mode: dockerfile -*- +# dockerfile to build libmxnet.so for armv7 +FROM dockcross/linux-arm64 + +ENV ARCH aarch64 +ENV BUILD_OPTS "USE_BLAS=openblas USE_SSE=0 USE_OPENCV=0" +ENV CC /usr/bin/aarch64-linux-gnu-gcc +ENV CXX /usr/bin/aarch64-linux-gnu-g++ +ENV FC /usr/bin/aarch64-linux-gnu-gfortran-4.9 +ENV HOSTCC gcc + +WORKDIR /work + +# Build OpenBLAS +ADD https://api.github.com/repos/xianyi/OpenBLAS/git/refs/heads/master /tmp/openblas_version.json +RUN git clone https://github.com/xianyi/OpenBLAS.git && \ + cd OpenBLAS && \ + make -j$(nproc) TARGET=ARMV8 && \ + make install && \ + ln -s /opt/OpenBLAS/lib/libopenblas.so /usr/lib/libopenblas.so && \ + ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/libopenblas.a && \ + ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/liblapack.a + +ENV LD_LIBRARY_PATH /opt/OpenBLAS/lib +ENV CPLUS_INCLUDE_PATH /opt/OpenBLAS/include + +# Build MXNet +#ADD https://api.github.com/repos/apache/incubator-mxnet/git/refs/heads/master mxnet_version.json +#RUN git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet +ADD mxnet mxnet + +WORKDIR /work/mxnet +ADD arm.crosscompile.mk make/config.mk +RUN make -j$(nproc) $BUILD_OPTS + +WORKDIR /work/build/ +RUN cp /work/mxnet/lib/* . 
diff --git a/docker_multiarch/Dockerfile.build.armv6 b/docker_multiarch/Dockerfile.build.armv6 new file mode 100644 index 000000000000..9adfc5b49d42 --- /dev/null +++ b/docker_multiarch/Dockerfile.build.armv6 @@ -0,0 +1,38 @@ +# -*- mode: dockerfile -*- +# dockerfile to build libmxnet.so for armv7 +FROM dockcross/linux-armv6 + +ENV ARCH armv6l +ENV BUILD_OPTS "USE_BLAS=openblas USE_SSE=0 USE_OPENCV=0" +ENV CC /usr/bin/arm-linux-gnueabihf-gcc +ENV CXX /usr/bin/arm-linux-gnueabihf-g++ +ENV FC /usr/bin/arm-linux-gnueabihf-gfortran +ENV HOSTCC gcc + +WORKDIR /work + +# Build OpenBLAS +ADD https://api.github.com/repos/xianyi/OpenBLAS/git/refs/heads/master openblas_version.json +RUN git clone https://github.com/xianyi/OpenBLAS.git && \ + cd OpenBLAS && \ + make -j$(nproc) TARGET=ARMV6 && \ + make install && \ + ln -s /opt/OpenBLAS/lib/libopenblas.so /usr/lib/gcc/arm-linux-gnueabihf/libopenblas.so && \ + ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/gcc/arm-linux-gnueabihf/libopenblas.a && \ + ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/gcc/arm-linux-gnueabihf/liblapack.a && \ + ln -s /opt/OpenBLAS/lib/libopenblas.a /usr/lib/liblapack.a + +ENV LD_LIBRARY_PATH /opt/OpenBLAS/lib +ENV CPLUS_INCLUDE_PATH /opt/OpenBLAS/include + +# Build MXNet +#ADD https://api.github.com/repos/apache/incubator-mxnet/git/refs/heads/master mxnet_version.json +#RUN git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet +ADD mxnet mxnet + +WORKDIR /work/mxnet +ADD arm.crosscompile.mk make/config.mk +RUN make -j$(nproc) $BUILD_OPTS + +WORKDIR /work/build/ +RUN cp /work/mxnet/lib/* . 
diff --git a/docker_multiarch/Dockerfile.build.armv7 b/docker_multiarch/Dockerfile.build.armv7 new file mode 100644 index 000000000000..740f2b22da64 --- /dev/null +++ b/docker_multiarch/Dockerfile.build.armv7 @@ -0,0 +1,26 @@ +# -*- mode: dockerfile -*- +# dockerfile to build libmxnet.so for armv7 +FROM dockcross/linux-armv7 + +RUN apt-get update && \ + apt-get install -y libopenblas-dev:armhf && \ + rm -rf /var/lib/apt/lists/* + +ENV ARCH armv71 +ENV CC /usr/bin/arm-linux-gnueabihf-gcc +ENV CXX /usr/bin/arm-linux-gnueabihf-g++ +ENV BUILD_OPTS "USE_OPENCV=0 USE_BLAS=openblas USE_SSE=0" + +# Build MXNet + +WORKDIR /work +#ADD https://api.github.com/repos/apache/incubator-mxnet/git/refs/heads/master mxnet_version.json +#RUN git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet +ADD mxnet mxnet + +WORKDIR /work/mxnet +ADD arm.crosscompile.mk make/config.mk +RUN make -j$(nproc) $BUILD_OPTS + +WORKDIR /work/build/ +RUN cp /work/mxnet/lib/* . diff --git a/docker_multiarch/Dockerfile.build.cmake.ubuntu-17.04 b/docker_multiarch/Dockerfile.build.cmake.ubuntu-17.04 new file mode 100644 index 000000000000..cf0a981e04fc --- /dev/null +++ b/docker_multiarch/Dockerfile.build.cmake.ubuntu-17.04 @@ -0,0 +1,37 @@ +FROM ubuntu:17.04 + + +RUN apt-get update &&\ + apt-get install -y wget python3.5 gcc-4.9 gcc-5 g++-4.9 g++-5 cmake less python3-pip python3-dev\ + build-essential git pkgconf\ + libopenblas-dev liblapack-dev\ + maven default-jdk\ + ninja-build\ + libgtest-dev\ + &&\ + rm -rf /var/lib/apt/lists/* + + + +########################### +# Build gtest +WORKDIR /work/googletest +RUN cmake /usr/src/googletest/googletest/ -GNinja +RUN ninja +RUN cp libgtest.a /usr/lib +########################### + + + +WORKDIR /work +#RUN git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet +ADD mxnet mxnet + +WORKDIR mxnet/build +RUN cmake -DUSE_CUDA=OFF -DUSE_OPENCV=OFF -GNinja .. 
+RUN ninja + + +# Copy artifacts +RUN mkdir -p /work/build +RUN cp *.a *.so /work/build diff --git a/docker_multiarch/Dockerfile.build.ubuntu-16.04-cuda_8.0_cudnn5 b/docker_multiarch/Dockerfile.build.ubuntu-16.04-cuda_8.0_cudnn5 new file mode 100644 index 000000000000..071a351b3125 --- /dev/null +++ b/docker_multiarch/Dockerfile.build.ubuntu-16.04-cuda_8.0_cudnn5 @@ -0,0 +1,32 @@ +FROM nvidia/cuda:8.0-cudnn5-devel + +RUN apt-get update &&\ + apt-get install -y wget python3.5 gcc-4.9 gcc-5 g++-4.9 g++-5 cmake less python3-pip python3-dev\ + build-essential git pkgconf\ + libopenblas-dev liblapack-dev\ + maven default-jdk\ + &&\ + rm -rf /var/lib/apt/lists/* + +WORKDIR /work +#RUN git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet +ADD mxnet mxnet + +# Compile MxNet +ENV BUILD_OPTS "USE_OPENCV=0 USE_BLAS=openblas USE_CUDA=1 USE_CUDA_PATH=/usr/local/cuda USE_CUDNN=1" +WORKDIR /work/mxnet +RUN make -j$(nproc) $BUILD_OPTS + +WORKDIR /work/build/ +RUN cp /work/mxnet/lib/* . + +# Scala packag +#WORKDIR /work +#RUN wget --quiet http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb +#RUN dpkg -i scala-2.11.8.deb && rm scala-2.11.8.deb + +#WORKDIR /work/mxnet +#RUN make scalapkg $BUILD_OPTS + +#WORKDIR /work/build/scala_gpu +#RUN cp /work/mxnet/scala-package/assembly/linux-x86_64-gpu/target/*.jar . diff --git a/docker_multiarch/Dockerfile.build.ubuntu-17.04 b/docker_multiarch/Dockerfile.build.ubuntu-17.04 new file mode 100644 index 000000000000..63b3c0716d90 --- /dev/null +++ b/docker_multiarch/Dockerfile.build.ubuntu-17.04 @@ -0,0 +1,30 @@ +# +# Base image to build MXNet from source in ubuntu +# +# Other images depend on it, so build it like: +# +# docker build -f Dockerfile.build.ubuntu-17.04 -t mxnet.build.ubuntu-17.04 . 
+# +FROM ubuntu:17.04 + + +RUN apt-get update &&\ + apt-get install -y wget python3.5 gcc-4.9 gcc-5 g++-4.9 g++-5 cmake less python3-pip python3-dev\ + build-essential git pkgconf\ + libopenblas-dev liblapack-dev\ + maven default-jdk + +RUN rm -rf /var/lib/apt/lists/* + +WORKDIR /work +#RUN git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet +ADD mxnet mxnet + +# Compile MxNet +ENV BUILD_OPTS "USE_OPENCV=0 USE_BLAS=openblas" +WORKDIR /work/mxnet +RUN make -j$(nproc) $BUILD_OPTS + +# Copy artifacts +WORKDIR /work/build/ +RUN cp /work/mxnet/lib/* . diff --git a/docker_multiarch/Dockerfile.build.ubuntu-17.04.scala.docker b/docker_multiarch/Dockerfile.build.ubuntu-17.04.scala.docker new file mode 100644 index 000000000000..a31ce893783c --- /dev/null +++ b/docker_multiarch/Dockerfile.build.ubuntu-17.04.scala.docker @@ -0,0 +1,17 @@ +# Before building this image you would need to build MXNet by executing: +# docker build -f Dockerfile.build.ubuntu-17.04 -t mxnet.build.ubuntu-17.04 . +# if you haven't done it before. + +FROM mxnet.build.ubuntu-17.04 + +# Scala package +WORKDIR /work +RUN wget --quiet http://downloads.lightbend.com/scala/2.11.8/scala-2.11.8.deb +RUN dpkg -i scala-2.11.8.deb && rm scala-2.11.8.deb + +WORKDIR /work/mxnet +RUN make scalapkg $BUILD_OPTS + +WORKDIR /work/build/scala +RUN cp /work/mxnet/scala-package/core/target/*.jar . +RUN cp /work/mxnet/scala-package/assembly/linux-x86_64-cpu/target/*.jar . diff --git a/docker_multiarch/Dockerfile.run.ubuntu-17.04.julia b/docker_multiarch/Dockerfile.run.ubuntu-17.04.julia new file mode 100644 index 000000000000..df3a036d83fe --- /dev/null +++ b/docker_multiarch/Dockerfile.run.ubuntu-17.04.julia @@ -0,0 +1,24 @@ +# Before building this image you would need to build MXNet by executing: +# docker build -f Dockerfile.build.ubuntu-17.04 -t mxnet.build.ubuntu-17.04 . +# if you haven't done it before. 
+ +FROM mxnet.build.ubuntu-17.04 + +ENV DEBIAN_FRONTEND=noninteractive + +################## +# Julia installation +RUN wget -q https://julialang.s3.amazonaws.com/bin/linux/x64/0.5/julia-0.5.1-linux-x86_64.tar.gz\ + && tar -zxf julia-0.5.1-linux-x86_64.tar.gz\ + && rm julia-0.5.1-linux-x86_64.tar.gz\ + && ln -s $(pwd)/julia-6445c82d00/bin/julia /usr/bin/julia +################## + + +ENV MXNET_HOME /work/mxnet +WORKDIR /work/mxnet +RUN julia -e 'Pkg.add("MXNet")' + + + + diff --git a/docker_multiarch/Dockerfile.run.ubuntu-17.04.perl b/docker_multiarch/Dockerfile.run.ubuntu-17.04.perl new file mode 100644 index 000000000000..a1a637def23b --- /dev/null +++ b/docker_multiarch/Dockerfile.run.ubuntu-17.04.perl @@ -0,0 +1,23 @@ +# Before building this image you would need to build MXNet by executing: +# docker build -f Dockerfile.build.ubuntu-17.04 -t mxnet.build.ubuntu-17.04 . +# if you haven't done it before. + +FROM mxnet.build.ubuntu-17.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update +RUN apt-get install -y\ + libmouse-perl pdl cpanminus swig libgraphviz-perl +RUN rm -rf /var/lib/apt/lists/* + +RUN cpanm -q Function::Parameters + +WORKDIR /work/mxnet/perl-package/AI-MXNetCAPI +RUN perl Makefile.PL && make install + +WORKDIR /work/mxnet/perl-package/AI-NNVMCAPI/ +RUN perl Makefile.PL && make install + +WORKDIR /work/mxnet/perl-package/AI-MXNet/ +RUN perl Makefile.PL && make install diff --git a/docker_multiarch/Dockerfile.run.ubuntu-17.04.python b/docker_multiarch/Dockerfile.run.ubuntu-17.04.python new file mode 100644 index 000000000000..8bd262b891c4 --- /dev/null +++ b/docker_multiarch/Dockerfile.run.ubuntu-17.04.python @@ -0,0 +1,17 @@ +# Before building this image you would need to build MXNet by executing: +# docker build -f Dockerfile.build.ubuntu-17.04 -t mxnet.build.ubuntu-17.04 . +# if you haven't done it before. 
+ +FROM mxnet.build.ubuntu-17.04 + +ENV DEBIAN_FRONTEND=noninteractive + + +RUN apt-get update +RUN apt-get install -y python-pip +RUN rm -rf /var/lib/apt/lists/* + +WORKDIR /work/mxnet/python +RUN pip3 install -e . +RUN pip install -e . + diff --git a/docker_multiarch/Dockerfile.run.ubuntu-17.04.r b/docker_multiarch/Dockerfile.run.ubuntu-17.04.r new file mode 100644 index 000000000000..493c7f0ef179 --- /dev/null +++ b/docker_multiarch/Dockerfile.run.ubuntu-17.04.r @@ -0,0 +1,36 @@ +# Before building this image you would need to build MXNet by executing: +# docker build -f Dockerfile.build.ubuntu-17.04 -t mxnet.build.ubuntu-17.04 . +# if you haven't done it before. + +FROM mxnet.build.ubuntu-17.04 + +ENV DEBIAN_FRONTEND=noninteractive +#ENV BUILD_OPTS "USE_OPENCV=0 USE_BLAS=openblas GTEST_PATH=/usr/src/googletest/googletest" + +################## +# R installation +RUN apt-get update +#RUN apt-get remove -y gnupg +#RUN apt-get install -y --reinstall\ +# gnupg2 dirmngr + +RUN apt-get install -y dirmngr libopencv-dev +RUN echo "deb http://cran.rstudio.com/bin/linux/ubuntu zesty/" >> /etc/apt/sources.list +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E084DAB9 + +RUN apt-get install -y\ + r-base r-base-core r-recommended r-base-dev libxml2-dev libxt-dev libssl-dev libcurl4-openssl-dev + + +WORKDIR /work/mxnet +RUN cp R-package/DESCRIPTION . 
+RUN Rscript -e "install.packages('devtools', repo = 'https://cran.rstudio.com')" +RUN Rscript -e "library(devtools); library(methods); options(repos=c(CRAN='https://cran.rstudio.com')); install_deps(dependencies = TRUE)" + + +################## +# MXNet R package +RUN make rpkg +RUN R CMD INSTALL mxnet_current_r.tar.gz +################## + diff --git a/docker_multiarch/Dockerfile.test.ubuntu-17.04 b/docker_multiarch/Dockerfile.test.ubuntu-17.04 new file mode 100644 index 000000000000..1b0c145f488d --- /dev/null +++ b/docker_multiarch/Dockerfile.test.ubuntu-17.04 @@ -0,0 +1,41 @@ +FROM ubuntu-17.04 +RUN apt-get update &&\ + apt-get install -y python3-nose python-nose python-pip libgtest-dev valgrind ninja-build\ + &&\ + rm -rf /var/lib/apt/lists/* + +########################### +# Unit tests +# Build google test +WORKDIR /work/googletest +RUN cmake /usr/src/googletest/googletest/ -GNinja +RUN ninja +# FIXME +RUN mkdir -p /usr/src/googletest/googletest/lib/ +RUN cp libgtest.a /usr/src/googletest/googletest/lib/ + +ENV BUILD_OPTS "USE_OPENCV=0 USE_BLAS=openblas GTEST_PATH=/usr/src/googletest/googletest" + +WORKDIR /work/mxnet +RUN make -j$(nproc) test $BUILD_OPTS +ENV MXNET_ENGINE_INFO=true +RUN build/tests/cpp/mxnet_test +RUN valgrind build/tests/cpp/mxnet_test +############################ + +############################ +# Python tests +WORKDIR /work/mxnet/python +RUN pip3 install -e . +RUN pip install -e . 
+ +WORKDIR /work/mxnet +RUN nosetests3 --verbose tests/python/unittest +RUN nosetests --verbose tests/python/unittest +############################ + + +############################ +# Scala tests +RUN make scalatest $BUILD_OPTS +############################ diff --git a/docker_multiarch/README.md b/docker_multiarch/README.md new file mode 100644 index 000000000000..a463d15bc82d --- /dev/null +++ b/docker_multiarch/README.md @@ -0,0 +1,42 @@ +# Dockerized multi-architecture build + +These docker files and utilities will build mxnet and run tests for different architectures using cross compilation and produce +runtime binary artifacts. + +These utilities require that you have docker installed. [Docker CE](https://docs.docker.com/engine/installation/linux/docker-ce/ubuntu/#install-docker) is recommended. + + +To compile for all the supported architectures you can run the script +``` +$ ./tool.py +``` + +To build a single arch, you can invoke docker directly: + +``` +$ docker build -f Dockerfile.build.<arch> -t <tag> . +``` + +Or call the dockerfile directly: + +``` +docker build -f <dockerfile> -t <tag> . +``` + +Or pass the architecture id to the tool: +``` +$ ./tool.py -a ubuntu-17.04 +``` + +By convention all the Dockerfiles produce the build artifacts in /work/build so they can be copied +after. + + +The tool will leave the resulting artifacts in the build/ directory. + +# TODO + +- Handle dependencies between docker files, for example having a yaml file with the dependency graph + so they can be built in the right order. Right now the dependency is very simple so simple + alphabetical sorting of the images does the trick.
+ diff --git a/make/pip_linux_cpu.mk b/docker_multiarch/arm.crosscompile.android.mk similarity index 80% rename from make/pip_linux_cpu.mk rename to docker_multiarch/arm.crosscompile.android.mk index 01bc2702ebb7..22a5bfb6810e 100644 --- a/make/pip_linux_cpu.mk +++ b/docker_multiarch/arm.crosscompile.android.mk @@ -1,13 +1,26 @@ #------------------------------------------------------------------------------- -# Template configuration for compiling mxnet for making python wheel +# Template configuration for compiling mxnet +# +# If you want to change the configuration, please use the following +# steps. Assume you are on the root directory of mxnet. First copy the this +# file so that any local changes will be ignored by git +# +# $ cp make/config.mk . +# +# Next modify the according entries, and then compile by +# +# $ make +# +# or build in parallel with 8 threads +# +# $ make -j8 #------------------------------------------------------------------------------- #--------------------- -# choice of compiler +# We do not assign compilers here. Often when cross-compiling these will already +# be set correctly. 
#-------------------- -export CC = gcc -export CXX = g++ export NVCC = nvcc # whether compile with options for MXNet developer @@ -20,32 +33,16 @@ DEBUG = 0 USE_PROFILER = # the additional link flags you want to add -ADD_LDFLAGS += -lopencv_core -lopencv_imgproc -lopencv_highgui +# TODO: Move flags here +ADD_LDFLAGS=-static-libstdc++ -L/work/OpenBLAS/ # the additional compile flags you want to add -ADD_CFLAGS += -Ldeps/lib -Ideps/include +ADD_CFLAGS = #--------------------------------------------- # matrix computation libraries for CPU/GPU #--------------------------------------------- -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -USE_BLAS=openblas - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 1 - # whether use CUDA during compile USE_CUDA = 0 @@ -60,10 +57,14 @@ USE_CUDNN = 0 # whether use cuda runtime compiling for writing kernels in native language (i.e. 
Python) USE_NVRTC = 0 +# whether use opencv during compilation +# you can disable it, however, you will not able to use +# imbin iterator +USE_OPENCV = 0 + # use openmp for parallelization USE_OPENMP = 1 - # MKL ML Library for Intel CPU/Xeon Phi # Please refer to MKL_README.md for details @@ -82,25 +83,27 @@ USE_MKL2017_EXPERIMENTAL = 0 # whether use NNPACK library USE_NNPACK = 0 +# For arm builds we're using openblas +USE_BLAS = openblas + +# whether use lapack during compilation +# only effective when compiled with blas versions openblas/apple/atlas/mkl +USE_LAPACK = 0 + +# path to lapack library in case of a non-standard installation +USE_LAPACK_PATH = + # add path to intel library, you may need it for MKL, if you did not add the path # to environment variable USE_INTEL_PATH = NONE -# If use MKL, choose static link automatically to allow python wrapper +# If use MKL only for BLAS, choose static link automatically to allow python wrapper +ifeq ($(USE_MKL2017), 0) ifeq ($(USE_BLAS), mkl) USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -ARCH := $(shell uname -a) -ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) - USE_SSE=0 else - USE_SSE=1 +USE_STATIC_MKL = NONE endif #---------------------------- @@ -129,6 +132,12 @@ USE_S3 = 0 # path to folders containing projects specific operators that you don't want to put in src/operators EXTRA_OPERATORS = +#---------------------------- +# other features +#---------------------------- + +# Create C++ interface package +USE_CPP_PACKAGE = 0 #---------------------------- # plugins diff --git a/docker_multiarch/arm.crosscompile.mk b/docker_multiarch/arm.crosscompile.mk new file mode 100644 index 000000000000..fea4e7777c22 --- /dev/null +++ b/docker_multiarch/arm.crosscompile.mk @@ -0,0 +1,162 @@ +#------------------------------------------------------------------------------- +# Template configuration for 
compiling mxnet +# +# If you want to change the configuration, please use the following +# steps. Assume you are on the root directory of mxnet. First copy the this +# file so that any local changes will be ignored by git +# +# $ cp make/config.mk . +# +# Next modify the according entries, and then compile by +# +# $ make +# +# or build in parallel with 8 threads +# +# $ make -j8 +#------------------------------------------------------------------------------- + +#--------------------- +# We do not assign compilers here. Often when cross-compiling these will already +# be set correctly. +#-------------------- + +export NVCC = nvcc + +# whether compile with options for MXNet developer +DEV = 0 + +# whether compile with debug +DEBUG = 0 + +# whether compiler with profiler +USE_PROFILER = + +# the additional link flags you want to add +# TODO: Move flags here +ADD_LDFLAGS=-static-libstdc++ + +# the additional compile flags you want to add +ADD_CFLAGS = + +#--------------------------------------------- +# matrix computation libraries for CPU/GPU +#--------------------------------------------- + +# whether use CUDA during compile +USE_CUDA = 0 + +# add the path to CUDA library to link and compile flag +# if you have already add them to environment variable, leave it as NONE +# USE_CUDA_PATH = /usr/local/cuda +USE_CUDA_PATH = NONE + +# whether use CuDNN R3 library +USE_CUDNN = 0 + +# whether use cuda runtime compiling for writing kernels in native language (i.e. 
Python) +USE_NVRTC = 0 + +# whether use opencv during compilation +# you can disable it, however, you will not able to use +# imbin iterator +USE_OPENCV = 0 + +# use openmp for parallelization +USE_OPENMP = 1 + +# MKL ML Library for Intel CPU/Xeon Phi +# Please refer to MKL_README.md for details + +# MKL ML Library folder, need to be root for /usr/local +# Change to User Home directory for standard user +# For USE_BLAS!=mkl only +MKLML_ROOT=/usr/local + +# whether use MKL2017 library +USE_MKL2017 = 0 + +# whether use MKL2017 experimental feature for high performance +# Prerequisite USE_MKL2017=1 +USE_MKL2017_EXPERIMENTAL = 0 + +# whether use NNPACK library +USE_NNPACK = 0 + +# For arm builds we're using openblas +USE_BLAS = openblas + +# whether use lapack during compilation +# only effective when compiled with blas versions openblas/apple/atlas/mkl +USE_LAPACK = 1 + +# path to lapack library in case of a non-standard installation +USE_LAPACK_PATH = + +# add path to intel library, you may need it for MKL, if you did not add the path +# to environment variable +USE_INTEL_PATH = NONE + +# If use MKL only for BLAS, choose static link automatically to allow python wrapper +ifeq ($(USE_MKL2017), 0) +ifeq ($(USE_BLAS), mkl) +USE_STATIC_MKL = 1 +endif +else +USE_STATIC_MKL = NONE +endif + +#---------------------------- +# distributed computing +#---------------------------- + +# whether or not to enable multi-machine supporting +USE_DIST_KVSTORE = 0 + +# whether or not allow to read and write HDFS directly. If yes, then hadoop is +# required +USE_HDFS = 0 + +# path to libjvm.so. required if USE_HDFS=1 +LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server + +# whether or not allow to read and write AWS S3 directly. 
If yes, then +# libcurl4-openssl-dev is required, it can be installed on Ubuntu by +# sudo apt-get install -y libcurl4-openssl-dev +USE_S3 = 0 + +#---------------------------- +# additional operators +#---------------------------- + +# path to folders containing projects specific operators that you don't want to put in src/operators +EXTRA_OPERATORS = + +#---------------------------- +# other features +#---------------------------- + +# Create C++ interface package +USE_CPP_PACKAGE = 0 + +#---------------------------- +# plugins +#---------------------------- + +# whether to use caffe integration. This requires installing caffe. +# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH +# CAFFE_PATH = $(HOME)/caffe +# MXNET_PLUGINS += plugin/caffe/caffe.mk + +# whether to use torch integration. This requires installing torch. +# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH +# TORCH_PATH = $(HOME)/torch +# MXNET_PLUGINS += plugin/torch/torch.mk + +# WARPCTC_PATH = $(HOME)/warp-ctc +# MXNET_PLUGINS += plugin/warpctc/warpctc.mk + +# whether to use sframe integration. This requires build sframe +# git@github.com:dato-code/SFrame.git +# SFRAME_PATH = $(HOME)/SFrame +# MXNET_PLUGINS += plugin/sframe/plugin.mk \ No newline at end of file diff --git a/docker_multiarch/tool.py b/docker_multiarch/tool.py new file mode 100755 index 000000000000..d0003ec05822 --- /dev/null +++ b/docker_multiarch/tool.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Multi arch dockerized build tool. + +""" + +__author__ = 'Pedro Larroy' +__version__ = '0.1' + +import os +import sys +import subprocess +import logging +import argparse +from subprocess import check_call +import glob +import re + +class CmdResult(object): + def __init__(self, std_out, std_err, status_code): + self.std_out = std_out + self.std_err = std_err + self.status_code = status_code if status_code is not None else 0 + + def __str__(self): + return "%s, %s, %s" % (self.std_out, self.std_err, self.status_code) + +def run(cmd, fail_on_error=True): + logging.debug("executing shell command:\n" + cmd) + proc = subprocess.Popen( + cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + std_out, std_err = proc.communicate() + if fail_on_error: + if proc.returncode != 0: + logging.warn('Error running command: {}'.format(cmd)) + assert proc.returncode == 0, std_err + res = CmdResult(std_out.decode('utf-8'), std_err.decode('utf-8'), proc.returncode) + return res + + +def mkdir_p(d): + rev_path_list = list() + head = d + while len(head) and head != os.sep: + rev_path_list.append(head) + (head, tail) = os.path.split(head) + + rev_path_list.reverse() + for p in rev_path_list: + try: + os.mkdir(p) + except OSError as e: + if e.errno != 17: + raise + +def get_arches(): + """Get a list of architectures given our dockerfiles""" + dockerfiles = glob.glob("Dockerfile.build.*") + dockerfiles = list(filter(lambda x: x[-1] != '~', dockerfiles)) + arches = list(map(lambda x: re.sub(r"Dockerfile.build.(.*)", r"\1", x), dockerfiles)) + arches.sort() 
+ return arches + +def sync_source(): + logging.info("Copying sources") + check_call(["rsync","-a","--delete","--exclude=\".git/\"",'--exclude=/docker_multiarch/',"../","mxnet"]) + +def get_docker_tag(arch): + return "mxnet.build.{0}".format(arch) + +def get_dockerfile(arch): + return "Dockerfile.build.{0}".format(arch) + +def build(arch): + """Build the given architecture in the container""" + assert arch in get_arches(), "No such architecture {0}, Dockerfile.build.{0} not found".format(arch) + logging.info("Building for target platform {0}".format(arch)) + check_call(["docker", "build", + "-f", get_dockerfile(arch), + "-t", get_docker_tag(arch), + "."]) + +def collect_artifacts(arch): + """Collects the artifacts built inside the docker container to the local fs""" + def artifact_path(arch): + return "{}/build/{}".format(os.getcwd(), arch) + logging.info("Collect artifacts from build in {0}".format(artifact_path(arch))) + mkdir_p("build/{}".format(arch)) + + # Mount artifact_path on /$arch inside the container and copy the build output so we can access + # locally from the host fs + check_call(["docker","run", + "-v", "{}:/{}".format(artifact_path(arch), arch), + get_docker_tag(arch), + "bash", "-c", "cp -r /work/build/* /{}".format(arch)]) + +def main(): + logging.getLogger().setLevel(logging.INFO) + logging.basicConfig(format='%(asctime)-15s %(message)s') + + parser = argparse.ArgumentParser() + parser.add_argument("-a", "--arch", + help="Architecture", + type=str) + + parser.add_argument("-l", "--list_arch", + help="List architectures", + action='store_true') + args = parser.parse_args() + + if args.list_arch: + arches = get_arches() + print(arches) + + elif args.arch: + sync_source() + build(args.arch) + collect_artifacts(args.arch) + + else: + arches = get_arches() + logging.info("Building for all architectures: {}".format(arches)) + logging.info("Artifacts will be produced in the build/ directory.") + sync_source() + for arch in arches: + build(arch) + 
collect_artifacts(arch) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/docs/README.md b/docs/README.md index 7780f8961cc6..ad64b76d648b 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,22 +1,91 @@ # MXNet documentation +## How to build MXNet website + +MXNet Documentation Website is built with [sphinx 1.5.1](http://www.sphinx-doc.org/en/1.5.1/intro.html). + A built version of document is available at http://mxnet.io -To build the documents locally, we need to first install [docker](docker.com). +To build the documents locally, we need to first install [docker](https://docker.com). Then use the following commands to clone and build the documents. ```bash -git clone --recursive https://github.com/dmlc/mxnet +git clone --recursive https://github.com/apache/incubator-mxnet.git mxnet cd mxnet && make docs ``` +In case docker method is not available, there is an alternate method: +```bash +sudo pip install sphinx==1.5.1 CommonMark==0.5.4 breathe mock==1.0.1 recommonmark pypandoc +cd mxnet/docs && make html USE_OPENMP=0 +``` + The results will be available at `docs/_build/html/`. Note: - If C++ codes have been changed, we suggest to remove the previous results to trigger the rebuild for all pages, namely run `make clean_docs`. -- If C++ codes are failed to build, run `make clean` +- If C++ code fails to build, run `make clean` - If CSS or javascript are changed, we often need to do a *force refresh* in the browser to clear the cache. +- If search doesn't work, we need to `make clean` and rebuild. + +## File structure + +1. Static files such as css, javascript and html templates are under `_static` folder: +- Javascript files are under `_static/js` folder. +- Layout templates and landing page html file are under `_static/mxnet-theme` folder. +- `_static/mxnet.css` contains all MXNet website styles. + +2. Sphinx converts markdowns files to html. Page contents are markdown files. Each content folder +contains an index file as landing page. 
+ +3. There are some utility scripts to help building website, such as `mxdoc.py` and `build_version_doc/`. +They are used to manipulate website contents during building. + +## Production website building process + +[Apache Jenkins MXNet website building job](https://builds.apache.org/job/incubator-mxnet-build-site/) is used to build MXNet website. +There are two ways to trigger this job. +First is nightly build for master branch. +Second is to manually trigger the job when a new version is released. This will build for new version. + +The job will fetch mxnet repository, build MXNet website and push all static files to [host repository](https://github.com/apache/incubator-mxnet-site.git). +The host repo is hooked with [Apache gitbox](https://gitbox.apache.org/repos/asf?p=incubator-mxnet-site.git;a=summary) to host website. + +## Build versioning website + +`make docs` doesn't add any version information. Version information is added by [Apache Jenkins MXNet website building job](https://builds.apache.org/job/incubator-mxnet-build-site/). +Landing page will point to the latest released version. Older versions and master version are placed under versions folder. +After completing website update and testing it locally, we also need to build and test versioning website. + +Python Beautifulsoup is the dependency: + +```bash +sudo pip install beautifulsoup4 +``` + +The essential part of adding version is to use `AddPackageLink.py` to add Apache source packages and +`AddVersion.py` to update all version related information on website. These two scripts are used in `build_doc.sh` and `build_all_version`. + +`build_doc.sh` is used by Apache Jenkins MXNet website building job to incrementally add versions. We don't need it +for local website development. + +`build_all_version.sh` is to rebuild versioning website locally and is useful to verify versioning website locally. +We need to specify which versions to be built.
This can be set in `tag_list` variable at the beginning of the script. +Version order should be from latest to oldest and placing master at the end. We may also want to modify `mxnet_url` +variable to our own repository for local testing. Another use case is to completely rebuild website with specific versions. +Although this will not happen often, we can use it to rebuild the whole website and push to [host repo](https://github.com/apache/incubator-mxnet-site.git). + +```bash +./build_all_version.sh +``` + +## Develop notes + +1. `AddVersion.py` depends on the Beautifulsoup library, which requires target html files to have close tags. Although open tag html can still be rendered by browser, it will be problematic for Beautifulsoup. + +2. `AddVersion.py` and `AddPackageLink.py` manipulate contents for the website. If there are layout changes, they may break these two scripts. We need to update these scripts accordingly. + diff --git a/docs/_static/cn.svg b/docs/_static/cn.svg index 515176d60f15..9fb3fc084c3c 100644 --- a/docs/_static/cn.svg +++ b/docs/_static/cn.svg @@ -1,4 +1,24 @@ + + + Flag of the People's Republic of China diff --git a/docs/_static/jquery-1.11.1.js b/docs/_static/jquery-1.11.1.js index d4b67f7e6c1a..a0ee94130c93 100644 --- a/docs/_static/jquery-1.11.1.js +++ b/docs/_static/jquery-1.11.1.js @@ -6,7 +6,28 @@ * http://sizzlejs.com/ * * Copyright 2005, 2014 jQuery Foundation, Inc.
and other contributors + * ---- * Released under the MIT license + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * ---- * http://jquery.org/license * * Date: 2014-05-01T17:42Z diff --git a/docs/_static/js/auto_module_index.js b/docs/_static/js/auto_module_index.js index 7f4e185655d3..83bdbf37173b 100644 --- a/docs/_static/js/auto_module_index.js +++ b/docs/_static/js/auto_module_index.js @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + function auto_index(module) { $(document).ready(function () { // find all classes or functions @@ -21,4 +40,4 @@ function auto_index(module) { html += ""; li_node.append(html); }); -} \ No newline at end of file +} diff --git a/docs/_static/js/clipboard.min.js b/docs/_static/js/clipboard.min.js old mode 100755 new mode 100644 index 1993676f9928..a23c4e1384d5 --- a/docs/_static/js/clipboard.min.js +++ b/docs/_static/js/clipboard.min.js @@ -1,7 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
* clipboard.js v1.6.1 * https://zenorocha.github.io/clipboard.js * * Licensed MIT © Zeno Rocha */ -!function(e){if("object"==typeof exports&&"undefined"!=typeof module)module.exports=e();else if("function"==typeof define&&define.amd)define([],e);else{var t;t="undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:this,t.Clipboard=e()}}(function(){var e,t,n;return function e(t,n,o){function i(a,c){if(!n[a]){if(!t[a]){var l="function"==typeof require&&require;if(!c&&l)return l(a,!0);if(r)return r(a,!0);var u=new Error("Cannot find module '"+a+"'");throw u.code="MODULE_NOT_FOUND",u}var s=n[a]={exports:{}};t[a][0].call(s.exports,function(e){var n=t[a][1][e];return i(n?n:e)},s,s.exports,e,t,n,o)}return n[a].exports}for(var r="function"==typeof require&&require,a=0;a0&&void 0!==arguments[0]?arguments[0]:{};this.action=t.action,this.emitter=t.emitter,this.target=t.target,this.text=t.text,this.trigger=t.trigger,this.selectedText=""}},{key:"initSelection",value:function e(){this.text?this.selectFake():this.target&&this.selectTarget()}},{key:"selectFake",value:function e(){var t=this,n="rtl"==document.documentElement.getAttribute("dir");this.removeFake(),this.fakeHandlerCallback=function(){return t.removeFake()},this.fakeHandler=document.body.addEventListener("click",this.fakeHandlerCallback)||!0,this.fakeElem=document.createElement("textarea"),this.fakeElem.style.fontSize="12pt",this.fakeElem.style.border="0",this.fakeElem.style.padding="0",this.fakeElem.style.margin="0",this.fakeElem.style.position="absolute",this.fakeElem.style[n?"right":"left"]="-9999px";var o=window.pageYOffset||document.documentElement.scrollTop;this.fakeElem.style.top=o+"px",this.fakeElem.setAttribute("readonly",""),this.fakeElem.value=this.text,document.body.appendChild(this.fakeElem),this.selectedText=(0,i.default)(this.fakeElem),this.copyText()}},{key:"removeFake",value:function 
e(){this.fakeHandler&&(document.body.removeEventListener("click",this.fakeHandlerCallback),this.fakeHandler=null,this.fakeHandlerCallback=null),this.fakeElem&&(document.body.removeChild(this.fakeElem),this.fakeElem=null)}},{key:"selectTarget",value:function e(){this.selectedText=(0,i.default)(this.target),this.copyText()}},{key:"copyText",value:function e(){var t=void 0;try{t=document.execCommand(this.action)}catch(e){t=!1}this.handleResult(t)}},{key:"handleResult",value:function e(t){this.emitter.emit(t?"success":"error",{action:this.action,text:this.selectedText,trigger:this.trigger,clearSelection:this.clearSelection.bind(this)})}},{key:"clearSelection",value:function e(){this.target&&this.target.blur(),window.getSelection().removeAllRanges()}},{key:"destroy",value:function e(){this.removeFake()}},{key:"action",set:function e(){var t=arguments.length>0&&void 0!==arguments[0]?arguments[0]:"copy";if(this._action=t,"copy"!==this._action&&"cut"!==this._action)throw new Error('Invalid "action" value, use either "copy" or "cut"')},get:function e(){return this._action}},{key:"target",set:function e(t){if(void 0!==t){if(!t||"object"!==("undefined"==typeof t?"undefined":r(t))||1!==t.nodeType)throw new Error('Invalid "target" value, use a valid Element');if("copy"===this.action&&t.hasAttribute("disabled"))throw new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if("cut"===this.action&&(t.hasAttribute("readonly")||t.hasAttribute("disabled")))throw new Error('Invalid "target" attribute. 
You can\'t cut text from elements with "readonly" or "disabled" attributes');this._target=t}},get:function e(){return this._target}}]),e}();e.exports=c})},{select:5}],8:[function(t,n,o){!function(i,r){if("function"==typeof e&&e.amd)e(["module","./clipboard-action","tiny-emitter","good-listener"],r);else if("undefined"!=typeof o)r(n,t("./clipboard-action"),t("tiny-emitter"),t("good-listener"));else{var a={exports:{}};r(a,i.clipboardAction,i.tinyEmitter,i.goodListener),i.clipboard=a.exports}}(this,function(e,t,n,o){"use strict";function i(e){return e&&e.__esModule?e:{default:e}}function r(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")}function a(e,t){if(!e)throw new ReferenceError("this hasn't been initialised - super() hasn't been called");return!t||"object"!=typeof t&&"function"!=typeof t?e:t}function c(e,t){if("function"!=typeof t&&null!==t)throw new TypeError("Super expression must either be null or a function, not "+typeof t);e.prototype=Object.create(t&&t.prototype,{constructor:{value:e,enumerable:!1,writable:!0,configurable:!0}}),t&&(Object.setPrototypeOf?Object.setPrototypeOf(e,t):e.__proto__=t)}function l(e,t){var n="data-clipboard-"+e;if(t.hasAttribute(n))return t.getAttribute(n)}var u=i(t),s=i(n),f=i(o),d=function(){function e(e,t){for(var n=0;n0&&void 0!==arguments[0]?arguments[0]:{};this.action="function"==typeof t.action?t.action:this.defaultAction,this.target="function"==typeof t.target?t.target:this.defaultTarget,this.text="function"==typeof t.text?t.text:this.defaultText}},{key:"listenClick",value:function e(t){var n=this;this.listener=(0,f.default)(t,"click",function(e){return n.onClick(e)})}},{key:"onClick",value:function e(t){var n=t.delegateTarget||t.currentTarget;this.clipboardAction&&(this.clipboardAction=null),this.clipboardAction=new u.default({action:this.action(n),target:this.target(n),text:this.text(n),trigger:n,emitter:this})}},{key:"defaultAction",value:function e(t){return 
l("action",t)}},{key:"defaultTarget",value:function e(t){var n=l("target",t);if(n)return document.querySelector(n)}},{key:"defaultText",value:function e(t){return l("text",t)}},{key:"destroy",value:function e(){this.listener.destroy(),this.clipboardAction&&(this.clipboardAction.destroy(),this.clipboardAction=null)}}],[{key:"isSupported",value:function e(){var t=arguments.length>0&&void 0!==arguments[0]?arguments[0]:["copy","cut"],n="string"==typeof t?[t]:t,o=!!document.queryCommandSupported;return n.forEach(function(e){o=o&&!!document.queryCommandSupported(e)}),o}}]),t}(s.default);e.exports=h})},{"./clipboard-action":7,"good-listener":4,"tiny-emitter":6}]},{},[8])(8)}); \ No newline at end of file +!function(e){if("object"==typeof exports&&"undefined"!=typeof module)module.exports=e();else if("function"==typeof define&&define.amd)define([],e);else{var t;t="undefined"!=typeof window?window:"undefined"!=typeof global?global:"undefined"!=typeof self?self:this,t.Clipboard=e()}}(function(){var e,t,n;return function e(t,n,o){function i(a,c){if(!n[a]){if(!t[a]){var l="function"==typeof require&&require;if(!c&&l)return l(a,!0);if(r)return r(a,!0);var u=new Error("Cannot find module '"+a+"'");throw u.code="MODULE_NOT_FOUND",u}var s=n[a]={exports:{}};t[a][0].call(s.exports,function(e){var n=t[a][1][e];return i(n?n:e)},s,s.exports,e,t,n,o)}return n[a].exports}for(var r="function"==typeof require&&require,a=0;a0&&void 0!==arguments[0]?arguments[0]:{};this.action=t.action,this.emitter=t.emitter,this.target=t.target,this.text=t.text,this.trigger=t.trigger,this.selectedText=""}},{key:"initSelection",value:function e(){this.text?this.selectFake():this.target&&this.selectTarget()}},{key:"selectFake",value:function e(){var t=this,n="rtl"==document.documentElement.getAttribute("dir");this.removeFake(),this.fakeHandlerCallback=function(){return 
t.removeFake()},this.fakeHandler=document.body.addEventListener("click",this.fakeHandlerCallback)||!0,this.fakeElem=document.createElement("textarea"),this.fakeElem.style.fontSize="12pt",this.fakeElem.style.border="0",this.fakeElem.style.padding="0",this.fakeElem.style.margin="0",this.fakeElem.style.position="absolute",this.fakeElem.style[n?"right":"left"]="-9999px";var o=window.pageYOffset||document.documentElement.scrollTop;this.fakeElem.style.top=o+"px",this.fakeElem.setAttribute("readonly",""),this.fakeElem.value=this.text,document.body.appendChild(this.fakeElem),this.selectedText=(0,i.default)(this.fakeElem),this.copyText()}},{key:"removeFake",value:function e(){this.fakeHandler&&(document.body.removeEventListener("click",this.fakeHandlerCallback),this.fakeHandler=null,this.fakeHandlerCallback=null),this.fakeElem&&(document.body.removeChild(this.fakeElem),this.fakeElem=null)}},{key:"selectTarget",value:function e(){this.selectedText=(0,i.default)(this.target),this.copyText()}},{key:"copyText",value:function e(){var t=void 0;try{t=document.execCommand(this.action)}catch(e){t=!1}this.handleResult(t)}},{key:"handleResult",value:function e(t){this.emitter.emit(t?"success":"error",{action:this.action,text:this.selectedText,trigger:this.trigger,clearSelection:this.clearSelection.bind(this)})}},{key:"clearSelection",value:function e(){this.target&&this.target.blur(),window.getSelection().removeAllRanges()}},{key:"destroy",value:function e(){this.removeFake()}},{key:"action",set:function e(){var t=arguments.length>0&&void 0!==arguments[0]?arguments[0]:"copy";if(this._action=t,"copy"!==this._action&&"cut"!==this._action)throw new Error('Invalid "action" value, use either "copy" or "cut"')},get:function e(){return this._action}},{key:"target",set:function e(t){if(void 0!==t){if(!t||"object"!==("undefined"==typeof t?"undefined":r(t))||1!==t.nodeType)throw new Error('Invalid "target" value, use a valid Element');if("copy"===this.action&&t.hasAttribute("disabled"))throw 
new Error('Invalid "target" attribute. Please use "readonly" instead of "disabled" attribute');if("cut"===this.action&&(t.hasAttribute("readonly")||t.hasAttribute("disabled")))throw new Error('Invalid "target" attribute. You can\'t cut text from elements with "readonly" or "disabled" attributes');this._target=t}},get:function e(){return this._target}}]),e}();e.exports=c})},{select:5}],8:[function(t,n,o){!function(i,r){if("function"==typeof e&&e.amd)e(["module","./clipboard-action","tiny-emitter","good-listener"],r);else if("undefined"!=typeof o)r(n,t("./clipboard-action"),t("tiny-emitter"),t("good-listener"));else{var a={exports:{}};r(a,i.clipboardAction,i.tinyEmitter,i.goodListener),i.clipboard=a.exports}}(this,function(e,t,n,o){"use strict";function i(e){return e&&e.__esModule?e:{default:e}}function r(e,t){if(!(e instanceof t))throw new TypeError("Cannot call a class as a function")}function a(e,t){if(!e)throw new ReferenceError("this hasn't been initialised - super() hasn't been called");return!t||"object"!=typeof t&&"function"!=typeof t?e:t}function c(e,t){if("function"!=typeof t&&null!==t)throw new TypeError("Super expression must either be null or a function, not "+typeof t);e.prototype=Object.create(t&&t.prototype,{constructor:{value:e,enumerable:!1,writable:!0,configurable:!0}}),t&&(Object.setPrototypeOf?Object.setPrototypeOf(e,t):e.__proto__=t)}function l(e,t){var n="data-clipboard-"+e;if(t.hasAttribute(n))return t.getAttribute(n)}var u=i(t),s=i(n),f=i(o),d=function(){function e(e,t){for(var n=0;n0&&void 0!==arguments[0]?arguments[0]:{};this.action="function"==typeof t.action?t.action:this.defaultAction,this.target="function"==typeof t.target?t.target:this.defaultTarget,this.text="function"==typeof t.text?t.text:this.defaultText}},{key:"listenClick",value:function e(t){var n=this;this.listener=(0,f.default)(t,"click",function(e){return n.onClick(e)})}},{key:"onClick",value:function e(t){var 
n=t.delegateTarget||t.currentTarget;this.clipboardAction&&(this.clipboardAction=null),this.clipboardAction=new u.default({action:this.action(n),target:this.target(n),text:this.text(n),trigger:n,emitter:this})}},{key:"defaultAction",value:function e(t){return l("action",t)}},{key:"defaultTarget",value:function e(t){var n=l("target",t);if(n)return document.querySelector(n)}},{key:"defaultText",value:function e(t){return l("text",t)}},{key:"destroy",value:function e(){this.listener.destroy(),this.clipboardAction&&(this.clipboardAction.destroy(),this.clipboardAction=null)}}],[{key:"isSupported",value:function e(){var t=arguments.length>0&&void 0!==arguments[0]?arguments[0]:["copy","cut"],n="string"==typeof t?[t]:t,o=!!document.queryCommandSupported;return n.forEach(function(e){o=o&&!!document.queryCommandSupported(e)}),o}}]),t}(s.default);e.exports=h})},{"./clipboard-action":7,"good-listener":4,"tiny-emitter":6}]},{},[8])(8)}); diff --git a/docs/_static/js/copycode.js b/docs/_static/js/copycode.js index 141e797fa2c2..b1c268cfec3b 100644 --- a/docs/_static/js/copycode.js +++ b/docs/_static/js/copycode.js @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + /*Copy code to clipboard*/ LANG_GP = {'default':'>>> ', 'python':'>>> ' , 'scala':'scala>', 'julia':'julia> ', 'r':'> ', 'perl':'pdl>' , 'cpp':'', 'bash':'$ '}; @@ -6,6 +25,7 @@ function addBtn() { 'data-placement="bottom" title="Copy to clipboard">' for (var lang in LANG_GP) { codeBlock = $('div .highlight-' + lang); + codeBlock.css('position', 'relative') codeBlock.prepend(copyBtn); codeBlock.find('.copy-btn').addClass(lang); codeBlock.hover( diff --git a/docs/_static/js/navbar.js b/docs/_static/js/navbar.js index 91e0356d9263..e3601c409ee0 100644 --- a/docs/_static/js/navbar.js +++ b/docs/_static/js/navbar.js @@ -1,9 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + var searchBox = $("#search-input-wrap"); -var TITLE = ['/get_started/', '/tutorials/', '/how_to/', '/api/', '/architecture/']; -var APIsubMenu; +var TITLE = ['/get_started/', '/tutorials/', '/gluon/' , '/api/', '/community/contribute.html', ]; +var DOC_TITLE = ['/faq/', '/architecture/', '/model_zoo/']; +var APISubmenu, versionSubmenu, docSubmenu; $("#burgerMenu").children().each(function () { - if($(this).children().first().html() == 'API') APIsubMenu = $(this).clone() - if($(this).children().first().html().startsWith('Versions')) VersionsubMenu = $(this).clone() + if($(this).children().first().html() == 'API') APISubmenu = $(this).clone(); + if($(this).children().first().html().startsWith('Versions')) versionSubmenu = $(this).clone(); + if($(this).children().first().html() == 'Docs') docSubmenu= $(this).clone(); }); function navbar() { @@ -40,10 +61,13 @@ function navbar() { $("#plusMenu").empty(); for (var i = 0; i < plusMenuList.length; ++i) { if(plusMenuList[i].attr('id') == 'dropdown-menu-position-anchor') { - $("#plusMenu").append(APIsubMenu); + $("#plusMenu").append(APISubmenu); } else if(plusMenuList[i].attr('id') == 'dropdown-menu-position-anchor-version') { - $("#plusMenu").append(VersionsubMenu); + $("#plusMenu").append(versionSubmenu); + } + else if(plusMenuList[i].attr('id') == 'dropdown-menu-position-anchor-docs') { + $("#plusMenu").append(docSubmenu); } else { $("#plusMenu").append("
  • "); @@ -62,8 +86,16 @@ function showTab() { var tab = $($('#main-nav').children().eq(i)); if(!tab.is('a')) tab = tab.find('a').first(); tab.css('border-bottom', '3px solid'); + return; } } + for(var i = 0; i < DOC_TITLE.length; ++i) { + if(url.indexOf(DOC_TITLE[i]) != -1) { + var tab = $($('#main-nav').children().eq(4)); + if(!tab.is('a')) tab = tab.find('a').first(); + tab.css('border-bottom', '3px solid'); + } + } } $(document).ready(function () { @@ -71,5 +103,7 @@ $(document).ready(function () { showTab(); $(window).resize(function () { navbar(); + if($("body").prop("clientWidth") < 1000 || $('div.sphinxsidebar').css('visibility') == 'hidden') $('div.content').css('width', '100%'); + else $('div.content').css('width', 'calc(100% - 300px)'); }); -}); \ No newline at end of file +}); diff --git a/docs/_static/js/options.js b/docs/_static/js/options.js index 77ef94074c57..6e285df88638 100644 --- a/docs/_static/js/options.js +++ b/docs/_static/js/options.js @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + $(document).ready(function () { function label(lbl) { return lbl.replace(/[ .]/g, '-').toLowerCase(); diff --git a/docs/_static/js/page.js b/docs/_static/js/page.js new file mode 100644 index 000000000000..9054bf49ca04 --- /dev/null +++ b/docs/_static/js/page.js @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* Generate url tracking for each page */ +var protocol = location.protocol.concat("//"); +var host = protocol.concat(window.location.host); +var path = window.location.pathname; +var pathArr = path.split('/'); +var icon = ''; +var urlTracker = "
    • MXNet" + icon + "
    • "; + +// Check whether this is another version +var lastUrl = host; +var versionIDX = -1; +for (var i = 1; i < pathArr.length; ++i) { + lastUrl += '/' + pathArr[i]; + if(pathArr[i] == 'versions') { + versionIDX = i; + lastUrl += '/' + pathArr[i + 1]; + break; + } +} +if (versionIDX > 0) { + pathArr = pathArr.slice(versionIDX + 1, pathArr.length); + urlTracker = "
      • MXNet" + icon + "
      • "; +} +else lastUrl = host; + +for (var i = 1; i < pathArr.length; ++i) { + if (pathArr[i] == 'index.html' || pathArr[i].length == 0) continue; + if (pathArr[i].indexOf('#') != -1) pathArr[i] = pathArr[i].substring(0, pathArr[i].indexOf('#')); + lastUrl += '/' + pathArr[i]; + if (pathArr[i].endsWith('.html')) pathArr[i] = pathArr[i].substring(0, pathArr[i].length - 5); + if (i == pathArr.length - 1 || pathArr[i + 1].length == 0 || pathArr[i + 1] == 'index.html') { + if ( pathArr[i] == 'faq' ){ + pathArr[i] = "FAQ"; + } + urlTracker += "
      • " + pathArr[i].replace(/_/g, ' ') + "
      • "; + } + else { + // Check whether current folder has index.html. + // If it doesn't, disable the link. + $.ajax(lastUrl + '/index.html', { + type: "GET", + statusCode: { + 404: function (response) { + if (pathArr[i] == 'api') urlTracker += "
      • API" + icon + "
      • "; + else urlTracker += "
      • " + pathArr[i].replace(/_/g, ' ') + icon + "
      • "; + } + }, + success: function () { + item = pathArr[i] == 'ndarray' ? "NDArray" : pathArr[i]; + urlTracker += "
      • " + item.replace(/_/g, ' ') + "" + icon + "
      • "; + }, + async: false + }); + } +} +urlTracker += '
      '; +$('.page-tracker').append(urlTracker); + +/* Generate top download btn*/ +if ($('div.download-btn').length > 0) { + var topBtn = $('div.download-btn').clone(); + topBtn.addClass('download-btn-top'); + topBtn.insertAfter(".page-tracker"); +} + +/* Adjust footer position */ +var footerHeight = 252; +if ($('div.content-block').height() > $(window).height() - footerHeight) { + $('div.footer').css('position', 'relative'); +} diff --git a/docs/_static/js/search.js b/docs/_static/js/search.js index 9df9702225a2..e9c6e84410b0 100644 --- a/docs/_static/js/search.js +++ b/docs/_static/js/search.js @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + $(document).ready(function () { var searchForm = $("#search-input-wrap").children("form").first(); searchForm.append('
      '); @@ -16,4 +35,4 @@ $(document).ready(function () { $('#searchIcon span').addClass('glyphicon-search'); } }); -}); \ No newline at end of file +}); diff --git a/docs/_static/js/sidebar.js b/docs/_static/js/sidebar.js index 42607068e16e..890f8c36ad6b 100644 --- a/docs/_static/js/sidebar.js +++ b/docs/_static/js/sidebar.js @@ -1,6 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + /*Preprocess*/ var LANG = ['python', 'scala', 'r', 'julia', 'c++', 'perl']; -var TITLE_WITH_LANG = ['/get_started/', '/tutorials/', '/how_to/', '/architecture/']; +var TITLE_WITH_LANG = ['/get_started/', '/tutorials/', '/faq/', '/architecture/', '/community/']; for(var i = 0; i < LANG.length; ++i) { TITLE_WITH_LANG.push('/api/' + LANG[i] + '/'); } @@ -9,22 +28,18 @@ for(var i = 0; i < LANG.length; ++i) { var API_PAGE = ['python']; var isAPI = false; -function render_left_helper(toc, currentText) { +function render_left_helper(toc) { var lefttoc = toc; - var currentText = currentText, trailing = ' Documents'; - if (currentText.endsWith(trailing)) currentText = currentText.substring(0, currentText.length - trailing.length); - if (currentText == 'System') currentText = 'Architecture'; lefttoc.addClass('current'); $('.leftsidebar > .sphinxsidebarwrapper').children().remove(); $('.leftsidebar > .sphinxsidebarwrapper').append(lefttoc); - - $('.leftsidebar > .sphinxsidebarwrapper').prepend('

      Contents

      '); + addToggle('.leftsidebar'); $('.leftsidebar li a').click(function () { - $('.leftsidebar li a').css('color', 'black'); - $(this).css('color', '#337ab7'); + $('.leftsidebar li a').css('color', '#337ab7'); + $(this).css('color', 'black'); }); } @@ -38,6 +53,7 @@ function render_lefttoc() { $('.sphinxsidebar').css("visibility", "visible"); return; } + // If current page is not index page if (url.indexOf(indexTrailing) == -1) { for(var i = 0; i < TITLE_WITH_LANG.length; ++i) { var path = TITLE_WITH_LANG[i]; @@ -50,39 +66,36 @@ function render_lefttoc() { break; } } - var urlPath = 'https://' + window.location.host + version + path; + var protocol = location.protocol.concat("//"); + var urlPath = protocol + window.location.host + version + path; $.get(urlPath + indexTrailing, null, function(data) { - var currentText = $($.parseHTML(data)).find('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > a').html(); - if (isAPI) { - render_left_helper($($.parseHTML(data)).find('#table-of-contents > div > ul'), currentText); - } - else { - render_left_helper($($.parseHTML(data)).find('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > ul'), currentText); - var tocLink = $('.leftsidebar .sphinxsidebarwrapper .leaf a'); - var staticLink = 'http'; - tocLink.each(function () { - if (!$(this).attr('href').startsWith(staticLink)) { - $(this).attr('href', urlPath + $(this).attr('href')); - } - }); - } + var lastToc = $($.parseHTML(data)).find('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > ul') + render_left_helper(lastToc); + var tocLink = $('.leftsidebar .sphinxsidebarwrapper .leaf a'); + var staticLink = 'http'; + tocLink.each(function () { + if (!$(this).attr('href').startsWith(staticLink)) { + $(this).attr('href', urlPath + $(this).attr('href')); + } + }); keepExpand(); $('.sphinxsidebar').css("visibility", "visible"); + if ($('div.sphinxsidebar').css('display') != 'none') $('.content').css('width', 'calc(100% - 300px)'); + 
else $('.content').css('width', '100%'); }) } } } else { - var currentText = $('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > a').html(); - var toc = isAPI ? $('#table-of-contents > div > ul').clone() : $('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > ul').clone(); - render_left_helper(toc, currentText); + var toc = $('.leftsidebar > .sphinxsidebarwrapper > ul.current > li.current > ul').clone(); + render_left_helper(toc); $('.sphinxsidebar').css("visibility", "visible"); } } /*Render contents inside page*/ function render_righttoc() { - var url = window.location.href, apiFlag = '/api/', indexTrailing = 'index.html'; + var url = window.location.href, indexTrailing = 'index.html'; var rightTocTitle = "Page Contents"; $("div.rightsidebar > div.sphinxsidebarwrapper > h3").children().remove(); @@ -91,8 +104,8 @@ function render_righttoc() { addToggle('.rightsidebar'); $('.rightsidebar li a').click(function () { - $('.rightsidebar li a').css('color', 'black'); - $(this).css('color', '#337ab7'); + $('.rightsidebar li a').css('color', '#337ab7'); + $(this).css('color', 'black'); }); if (url.indexOf(indexTrailing) != -1 || isAPI) { @@ -107,8 +120,8 @@ function scroll_righttoc() { for(var i = 1; i < links.length; ++i) { var divID = links.eq(i).attr('href'); if ($(divID).offset().top - $(window).scrollTop() > navbarHeight) { - $('.rightsidebar a').css('color', 'black'); - links.eq(i - 1).css('color', '#337ab7'); + $('.rightsidebar a').css('color', '#337ab7'); + links.eq(i - 1).css('color', 'black'); if (!links.eq(i - 1).parent().hasClass('leaf')) { links.eq(i - 1).parent().removeClass('closed'); links.eq(i - 1).parent().addClass('opened'); @@ -173,7 +186,7 @@ function autoExpand(elem) { /*Keep toc expansion while redirecting*/ function keepExpand() { var url = window.location.href, currentEntry; - var entryList = isAPI ? 
$('.leftsidebar li') : $('.sphinxsidebar li'); + var entryList = $('.sphinxsidebar li'); for(var i = entryList.length - 1; i >= 0; --i) { var entryURL = entryList.eq(i).find('a').first().attr('href'); if (entryURL != '#' && url.indexOf(entryURL) != -1) { @@ -182,12 +195,12 @@ function keepExpand() { } } + //Merge right toc into left toc for API pages since they are quite long if (isAPI) { var rootEntry = currentEntry; - if (rootEntry.parent().parent().is('li')) rootEntry = rootEntry.parent().parent(); - rootEntry.children("ul").first().remove(); rootEntry.append($('.rightsidebar .sphinxsidebarwrapper > ul > li > ul').clone()); - var allEntry = $(".leftsidebar div.sphinxsidebarwrapper li"); + rootEntry.addClass('closed').removeClass('leaf'); + var allEntry = $(".leftsidebar div.sphinxsidebarwrapper li.toctree-l2 li"); allEntry.each(function () { var anchor = $(this).children("a").first(); anchor.click(function () { @@ -201,8 +214,8 @@ function keepExpand() { } }); $('.leftsidebar li a').click(function () { - $('.leftsidebar li a').css('color', 'black'); - $(this).css('color', '#337ab7'); + $('.leftsidebar li a').css('color', '#337ab7'); + $(this).css('color', 'black'); }); } currentEntry.find('a').first().css('color', '#337ab7'); @@ -218,9 +231,9 @@ function keepExpand() { $(document).ready(function () { var url = window.location.href, searchFlag = 'search.html'; + var showRightToc = false; try { - if(url.indexOf('/get_started/') != -1) return; - if (url.indexOf(searchFlag) == -1) { + if (url.indexOf('/get_started/') == -1 && url.indexOf(searchFlag) == -1) { for(var i = 0; i < API_PAGE.length; ++i) { if (url.indexOf('/api/' + API_PAGE[i]) != -1) { isAPI = true; @@ -230,13 +243,31 @@ $(document).ready(function () { render_righttoc(); if ($('.leftsidebar').length) render_lefttoc(); } - - if(url.indexOf('/api/') != -1) return; - $(window).scroll(function () { - scroll_righttoc(); - }); + if ($('div.sphinxsidebar').css('visibility') == 'hidden') 
$('.content').css('width', '100%'); + if (url.indexOf('/api/') != -1) return; + if (url.indexOf('/install/') != -1) { + $('div.sphinxsidebar').hide(); + $('.content').css('width', '100%'); + } + if (url.indexOf('/gluon/index.html') != -1) { + $('div.sphinxsidebar').hide(); + $('.content').css('width', '100%'); + } + if (showRightToc) { + $(window).scroll(function () { + scroll_righttoc(); + }); + } + else { + $('.rightsidebar').hide(); + } + // move right toc to left if current left toc is empty + if ($('.leftsidebar > .sphinxsidebarwrapper').children().length == 0) { + $('.leftsidebar > .sphinxsidebarwrapper').append($('.rightsidebar > .sphinxsidebarwrapper > ul')); + } } catch(err) { + if ($('div.sphinxsidebar').css('visibility') == 'hidden') $('.content').css('width', '100%'); return; } -}); \ No newline at end of file +}); diff --git a/docs/_static/mxnet-theme/footer.html b/docs/_static/mxnet-theme/footer.html index 45ba457a0722..76d694e8b34f 100644 --- a/docs/_static/mxnet-theme/footer.html +++ b/docs/_static/mxnet-theme/footer.html @@ -1,5 +1,35 @@ -
      + + diff --git a/docs/_static/mxnet-theme/index.html b/docs/_static/mxnet-theme/index.html index e381428758c0..40bd6dff5edc 100644 --- a/docs/_static/mxnet-theme/index.html +++ b/docs/_static/mxnet-theme/index.html @@ -1,13 +1,40 @@ + +
      - - -
      - Install +
      @@ -24,8 +51,8 @@

      Introducing - Gluon

      Learn More
      -

      MXNet 0.10.0 Released

      -

      We're excited to announce the release of MXNet 0.10.0! Check out the release notes for latest updates.

      +

      MXNet 0.12.0 Released

      +

      We're excited to announce the release of MXNet 0.12.0! Check out the release notes for latest updates.

      Learn More
      @@ -41,96 +68,30 @@

      MXNet Joining Apache

      -
      +
      + +

      Model Zoo

      +

      Off the shelf pre-trained models. Fast implementations of many state-of-the-art models.

      +
      + Model zoo +
      +
      +

      Examples

      -

      Explore projects from simple demos to state-of-the-art research

      +

      Explore projects from simple demos, examples, tutorials to state-of-the-art research.

      -
      - -

      Model Zoo

      -

      Off the shelf pre-trained models

      +
      + +

      Tutorials

      +

      These tutorials introduce a few fundamental concepts in deep learning and how to implement them in MXNet.

      - -
      -
      -
      -

      - MXNet is developed by collaborators from multiple universities and - companies. We sincerely thank the following organizations for supporting - MXNet and sponsoring its major developers (alphabetical order). -

      -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      - -
      -
      -
      -
      - -
      - -

      - Apache MXNet is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF. -

      -
      \ No newline at end of file diff --git a/docs/_static/mxnet-theme/layout.html b/docs/_static/mxnet-theme/layout.html index c312181a1476..3d5df27077d5 100644 --- a/docs/_static/mxnet-theme/layout.html +++ b/docs/_static/mxnet-theme/layout.html @@ -1,3 +1,22 @@ + + {%- block doctype -%} {%- endblock %} @@ -148,10 +167,15 @@ - + +
      {%- include "navbar.html" %} + {% if pagename != 'index' %} +
      +
      {% block body %} {% endblock %} - {%- include "footer.html" %}
      - {{ sidebar() }} +
      + {{ sidebar() }}
      + {%- include "footer.html" %} {%- else %} {%- include "index.html" %} {%- include "footer.html" %} {%- endif %} +
      @@ -176,6 +203,7 @@ +